From 1b03c5bf41222b723415638f03e00ed12cac076a Mon Sep 17 00:00:00 2001
From: pjd
Date: Sun, 27 Feb 2011 19:41:40 +0000
Subject: Finally... Import the latest open-source ZFS version - (SPA) 28.

A few new things are available from now on:

- Data deduplication.
- Triple-parity RAIDZ (RAIDZ3).
- zfs diff.
- zpool split.
- Snapshot holds.
- zpool import -F. Allows rewinding a corrupted pool to an earlier
  transaction group.
- Ability to import a pool in read-only mode.

MFC after: 1 month
---
 .../contrib/opensolaris/uts/common/Makefile.files | 22 +- sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c | 153 +- sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c | 5 +- .../contrib/opensolaris/uts/common/fs/zfs/arc.c | 733 +++-- .../contrib/opensolaris/uts/common/fs/zfs/bplist.c | 316 +- .../contrib/opensolaris/uts/common/fs/zfs/bpobj.c | 495 ++++ .../contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 962 ++++-- .../contrib/opensolaris/uts/common/fs/zfs/ddt.c | 1152 ++++++++ .../opensolaris/uts/common/fs/zfs/ddt_zap.c | 156 + .../contrib/opensolaris/uts/common/fs/zfs/dmu.c | 911 ++++-- .../opensolaris/uts/common/fs/zfs/dmu_diff.c | 245 ++ .../opensolaris/uts/common/fs/zfs/dmu_object.c | 34 +- .../opensolaris/uts/common/fs/zfs/dmu_objset.c | 1196 +++++--- .../opensolaris/uts/common/fs/zfs/dmu_send.c | 1030 +++++-- .../opensolaris/uts/common/fs/zfs/dmu_traverse.c | 244 +- .../contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c | 373 ++- .../opensolaris/uts/common/fs/zfs/dmu_zfetch.c | 10 +- .../contrib/opensolaris/uts/common/fs/zfs/dnode.c | 683 ++++- .../opensolaris/uts/common/fs/zfs/dnode_sync.c | 87 +- .../opensolaris/uts/common/fs/zfs/dsl_dataset.c | 2080 +++++++++---- .../opensolaris/uts/common/fs/zfs/dsl_deadlist.c | 474 +++ .../opensolaris/uts/common/fs/zfs/dsl_deleg.c | 57 +- .../opensolaris/uts/common/fs/zfs/dsl_dir.c | 281 +- .../opensolaris/uts/common/fs/zfs/dsl_pool.c | 376 ++- .../opensolaris/uts/common/fs/zfs/dsl_prop.c | 866 ++++-- .../opensolaris/uts/common/fs/zfs/dsl_scan.c | 1766 +++++++++++ .../opensolaris/uts/common/fs/zfs/dsl_scrub.c | 1060 ------- .../opensolaris/uts/common/fs/zfs/dsl_synctask.c | 61 +- .../opensolaris/uts/common/fs/zfs/fletcher.c | 245 -- .../contrib/opensolaris/uts/common/fs/zfs/lzjb.c | 37 +- .../opensolaris/uts/common/fs/zfs/metaslab.c | 344 ++- .../opensolaris/uts/common/fs/zfs/refcount.c | 40 +- .../contrib/opensolaris/uts/common/fs/zfs/sa.c | 1970 +++++++++++++ .../contrib/opensolaris/uts/common/fs/zfs/sha256.c | 123 +- .../contrib/opensolaris/uts/common/fs/zfs/spa.c | 3076 ++++++++++++++------ .../opensolaris/uts/common/fs/zfs/spa_config.c | 111 +- .../opensolaris/uts/common/fs/zfs/spa_errlog.c | 45 +- .../opensolaris/uts/common/fs/zfs/spa_history.c | 150 +- .../opensolaris/uts/common/fs/zfs/spa_misc.c | 454 ++- .../opensolaris/uts/common/fs/zfs/space_map.c | 11 +- .../opensolaris/uts/common/fs/zfs/sys/arc.h | 34 +- .../opensolaris/uts/common/fs/zfs/sys/bplist.h | 60 +- .../opensolaris/uts/common/fs/zfs/sys/bpobj.h | 91 + .../opensolaris/uts/common/fs/zfs/sys/dbuf.h | 93 +- .../opensolaris/uts/common/fs/zfs/sys/ddt.h | 246 ++ .../opensolaris/uts/common/fs/zfs/sys/dmu.h | 208 +- .../opensolaris/uts/common/fs/zfs/sys/dmu_impl.h | 40 +- .../opensolaris/uts/common/fs/zfs/sys/dmu_objset.h | 100 +- .../uts/common/fs/zfs/sys/dmu_traverse.h | 21 +- .../opensolaris/uts/common/fs/zfs/sys/dmu_tx.h | 15 +- .../opensolaris/uts/common/fs/zfs/sys/dnode.h | 76 +- .../uts/common/fs/zfs/sys/dsl_dataset.h | 114 +- .../uts/common/fs/zfs/sys/dsl_deadlist.h | 87 +
.../opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h | 7 +- .../opensolaris/uts/common/fs/zfs/sys/dsl_dir.h | 18 +- .../opensolaris/uts/common/fs/zfs/sys/dsl_pool.h | 62 +- .../opensolaris/uts/common/fs/zfs/sys/dsl_prop.h | 52 +- .../opensolaris/uts/common/fs/zfs/sys/dsl_scan.h | 108 + .../uts/common/fs/zfs/sys/dsl_synctask.h | 8 +- .../opensolaris/uts/common/fs/zfs/sys/metaslab.h | 22 +- .../uts/common/fs/zfs/sys/metaslab_impl.h | 10 +- .../opensolaris/uts/common/fs/zfs/sys/refcount.h | 17 +- .../contrib/opensolaris/uts/common/fs/zfs/sys/sa.h | 171 ++ .../opensolaris/uts/common/fs/zfs/sys/sa_impl.h | 287 ++ .../opensolaris/uts/common/fs/zfs/sys/spa.h | 265 +- .../opensolaris/uts/common/fs/zfs/sys/spa_boot.h | 5 +- .../opensolaris/uts/common/fs/zfs/sys/spa_impl.h | 83 +- .../opensolaris/uts/common/fs/zfs/sys/txg.h | 11 +- .../opensolaris/uts/common/fs/zfs/sys/txg_impl.h | 6 +- .../opensolaris/uts/common/fs/zfs/sys/uberblock.h | 10 +- .../uts/common/fs/zfs/sys/uberblock_impl.h | 11 +- .../opensolaris/uts/common/fs/zfs/sys/vdev.h | 37 +- .../opensolaris/uts/common/fs/zfs/sys/vdev_impl.h | 39 +- .../opensolaris/uts/common/fs/zfs/sys/zap.h | 72 +- .../opensolaris/uts/common/fs/zfs/sys/zap_impl.h | 28 +- .../opensolaris/uts/common/fs/zfs/sys/zap_leaf.h | 35 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_acl.h | 38 +- .../uts/common/fs/zfs/sys/zfs_context.h | 9 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_debug.h | 15 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_dir.h | 6 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h | 5 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h | 172 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h | 66 + .../opensolaris/uts/common/fs/zfs/sys/zfs_sa.h | 142 + .../opensolaris/uts/common/fs/zfs/sys/zfs_stat.h | 55 + .../opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h | 24 +- .../opensolaris/uts/common/fs/zfs/sys/zfs_znode.h | 116 +- .../opensolaris/uts/common/fs/zfs/sys/zil.h | 134 +- .../opensolaris/uts/common/fs/zfs/sys/zil_impl.h | 63 +- .../opensolaris/uts/common/fs/zfs/sys/zio.h | 330 ++- .../uts/common/fs/zfs/sys/zio_checksum.h | 26 +- .../uts/common/fs/zfs/sys/zio_compress.h | 16 +- .../opensolaris/uts/common/fs/zfs/sys/zio_impl.h | 182 +- .../opensolaris/uts/common/fs/zfs/sys/zrlock.h | 66 + .../opensolaris/uts/common/fs/zfs/sys/zvol.h | 28 +- .../contrib/opensolaris/uts/common/fs/zfs/txg.c | 175 +- .../opensolaris/uts/common/fs/zfs/uberblock.c | 6 +- .../contrib/opensolaris/uts/common/fs/zfs/vdev.c | 990 +++++-- .../opensolaris/uts/common/fs/zfs/vdev_cache.c | 6 +- .../opensolaris/uts/common/fs/zfs/vdev_disk.c | 119 +- .../opensolaris/uts/common/fs/zfs/vdev_file.c | 45 +- .../opensolaris/uts/common/fs/zfs/vdev_geom.c | 171 +- .../opensolaris/uts/common/fs/zfs/vdev_label.c | 174 +- .../opensolaris/uts/common/fs/zfs/vdev_mirror.c | 35 +- .../opensolaris/uts/common/fs/zfs/vdev_missing.c | 21 +- .../opensolaris/uts/common/fs/zfs/vdev_queue.c | 137 +- .../opensolaris/uts/common/fs/zfs/vdev_raidz.c | 1644 ++++++++--- .../opensolaris/uts/common/fs/zfs/vdev_root.c | 20 +- .../contrib/opensolaris/uts/common/fs/zfs/zap.c | 244 +- .../opensolaris/uts/common/fs/zfs/zap_leaf.c | 171 +- .../opensolaris/uts/common/fs/zfs/zap_micro.c | 539 +++- .../opensolaris/uts/common/fs/zfs/zfs_acl.c | 1290 ++++---- .../opensolaris/uts/common/fs/zfs/zfs_byteswap.c | 3 +- .../opensolaris/uts/common/fs/zfs/zfs_ctldir.c | 234 +- .../opensolaris/uts/common/fs/zfs/zfs_debug.c | 95 + .../opensolaris/uts/common/fs/zfs/zfs_dir.c | 247 +- .../contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c | 
691 ++++- .../opensolaris/uts/common/fs/zfs/zfs_fuid.c | 64 +- .../opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 3068 ++++++++++++++----- .../opensolaris/uts/common/fs/zfs/zfs_log.c | 178 +- .../opensolaris/uts/common/fs/zfs/zfs_onexit.c | 252 ++ .../opensolaris/uts/common/fs/zfs/zfs_replay.c | 147 +- .../opensolaris/uts/common/fs/zfs/zfs_rlock.c | 8 +- .../contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c | 334 +++ .../opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 863 ++++-- .../opensolaris/uts/common/fs/zfs/zfs_vnops.c | 2692 ++++++++++++----- .../opensolaris/uts/common/fs/zfs/zfs_znode.c | 1065 +++++-- .../contrib/opensolaris/uts/common/fs/zfs/zil.c | 1583 ++++++---- .../contrib/opensolaris/uts/common/fs/zfs/zio.c | 993 +++++-- .../opensolaris/uts/common/fs/zfs/zio_checksum.c | 140 +- .../opensolaris/uts/common/fs/zfs/zio_compress.c | 98 +- .../opensolaris/uts/common/fs/zfs/zio_inject.c | 181 +- .../contrib/opensolaris/uts/common/fs/zfs/zle.c | 86 + .../contrib/opensolaris/uts/common/fs/zfs/zrlock.c | 194 ++ .../contrib/opensolaris/uts/common/fs/zfs/zvol.c | 1995 +++++++++---- sys/cddl/contrib/opensolaris/uts/common/os/callb.c | 76 +- sys/cddl/contrib/opensolaris/uts/common/os/fm.c | 1402 +++++++++ sys/cddl/contrib/opensolaris/uts/common/sys/acl.h | 4 +- .../contrib/opensolaris/uts/common/sys/acl_impl.h | 2 +- sys/cddl/contrib/opensolaris/uts/common/sys/avl.h | 6 +- .../contrib/opensolaris/uts/common/sys/byteorder.h | 170 -- .../contrib/opensolaris/uts/common/sys/callb.h | 12 +- .../contrib/opensolaris/uts/common/sys/cpupart.h | 27 +- .../contrib/opensolaris/uts/common/sys/cpuvar.h | 112 +- sys/cddl/contrib/opensolaris/uts/common/sys/cred.h | 13 +- .../contrib/opensolaris/uts/common/sys/debug.h | 23 +- .../contrib/opensolaris/uts/common/sys/fm/fs/zfs.h | 13 + .../opensolaris/uts/common/sys/fm/protocol.h | 57 +- .../contrib/opensolaris/uts/common/sys/fm/util.h | 6 +- .../contrib/opensolaris/uts/common/sys/fs/zfs.h | 375 ++- .../contrib/opensolaris/uts/common/sys/fs/zut.h | 93 + sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h | 9 +- .../contrib/opensolaris/uts/common/sys/idmap.h | 10 +- .../contrib/opensolaris/uts/common/sys/isa_defs.h | 6 + .../contrib/opensolaris/uts/common/sys/nvpair.h | 10 +- .../contrib/opensolaris/uts/common/sys/processor.h | 3 +- .../contrib/opensolaris/uts/common/sys/sysevent.h | 132 +- .../opensolaris/uts/common/sys/sysevent/dev.h | 256 ++ .../uts/common/sys/sysevent/eventdefs.h | 52 +- .../contrib/opensolaris/uts/common/sys/sysmacros.h | 42 +- .../contrib/opensolaris/uts/common/sys/taskq.h | 8 + .../opensolaris/uts/common/sys/u8_textprep.h | 24 + .../contrib/opensolaris/uts/common/sys/vnode.h | 30 +- 163 files changed, 37659 insertions(+), 12788 deletions(-) create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c delete mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c delete mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h create mode 100644 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/os/fm.c delete mode 100644 sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h (limited to 'sys/cddl/contrib/opensolaris/uts') diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 2aaf5bc..2ab1d7b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -20,8 +20,8 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# # # This Makefile defines all file modules for the directory uts/common # and its children. 
These are the source files which may be considered @@ -30,8 +30,12 @@ ZFS_COMMON_OBJS += \ arc.o \ bplist.o \ + bpobj.o \ dbuf.o \ + ddt.o \ + ddt_zap.o \ dmu.o \ + dmu_diff.o \ dmu_send.o \ dmu_object.o \ dmu_objset.o \ @@ -41,17 +45,18 @@ ZFS_COMMON_OBJS += \ dnode_sync.o \ dsl_dir.o \ dsl_dataset.o \ + dsl_deadlist.o \ dsl_pool.o \ dsl_synctask.o \ dmu_zfetch.o \ dsl_deleg.o \ dsl_prop.o \ - dsl_scrub.o \ - fletcher.o \ + dsl_scan.o \ gzip.o \ lzjb.o \ metaslab.o \ refcount.o \ + sa.o \ sha256.o \ spa.o \ spa_config.o \ @@ -75,20 +80,25 @@ ZFS_COMMON_OBJS += \ zap_leaf.o \ zap_micro.o \ zfs_byteswap.o \ + zfs_debug.o \ zfs_fm.o \ zfs_fuid.o \ + zfs_sa.o \ zfs_znode.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ - zio_inject.o + zio_inject.o \ + zle.o \ + zrlock.o ZFS_SHARED_OBJS += \ zfs_namecheck.o \ zfs_deleg.o \ zfs_prop.o \ zfs_comutil.o \ + zfs_fletcher.o \ zpool_prop.o \ zprop_common.o @@ -99,7 +109,9 @@ ZFS_OBJS += \ zfs_ctldir.o \ zfs_dir.o \ zfs_ioctl.o \ + zfs_ioctl_compat.o \ zfs_log.o \ + zfs_onexit.o \ zfs_replay.o \ zfs_rlock.o \ rrwlock.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index 269c3eb..436918b3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -40,7 +40,6 @@ #include #include #include -#include #include @@ -108,6 +107,42 @@ * gfs_root_create_file() */ +#ifdef sun +/* + * gfs_make_opsvec: take an array of vnode type definitions and create + * their vnodeops_t structures + * + * This routine takes an array of gfs_opsvec_t's. It could + * alternatively take an array of gfs_opsvec_t*'s, which would allow + * vnode types to be completely defined in files external to the caller + * of gfs_make_opsvec(). As it stands, much more sharing takes place -- + * both the caller and the vnode type provider need to access gfsv_ops + * and gfsv_template, and the caller also needs to know gfsv_name. + */ +int +gfs_make_opsvec(gfs_opsvec_t *vec) +{ + int error, i; + + for (i = 0; ; i++) { + if (vec[i].gfsv_name == NULL) + return (0); + error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, + vec[i].gfsv_ops); + if (error) + break; + } + + cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", + vec[i].gfsv_name); + for (i--; i >= 0; i--) { + vn_freevnodeops(*vec[i].gfsv_ops); + *vec[i].gfsv_ops = NULL; + } + return (error); +} +#endif /* sun */ + /* * Low level directory routines * @@ -312,6 +347,22 @@ gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, cookies)); } +#ifdef sun +/* + * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer + * instead of a string for the entry's name. + */ +int +gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, + ino64_t ino, unsigned long num) +{ + char buf[40]; + + numtos(num, buf); + return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); +} +#endif + /* * gfs_readdir_pred: readdir loop predicate * voffp - a pointer in which the next virtual offset should be stored @@ -542,6 +593,28 @@ gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, return (vp); } +#ifdef sun +/* + * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem + * + * Similar to gfs_root_create(), this creates a root vnode for a file to + * be the pseudo-filesystem. 
+ */ +vnode_t * +gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino) +{ + vnode_t *vp = gfs_file_create(size, NULL, ops); + + ((gfs_file_t *)vp->v_data)->gfs_ino = ino; + + VFS_HOLD(vfsp); + VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0); + vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; + + return (vp); +} +#endif /* sun */ + /* * gfs_file_inactive() * @@ -570,7 +643,7 @@ gfs_file_inactive(vnode_t *vp) */ if ((dp = fp->gfs_parent->v_data) == NULL) return (NULL); - + /* * First, see if this vnode is cached in the parent. */ @@ -995,6 +1068,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, return (gfs_readdir_fini(&gstate, error, eofp, eof)); } + /* * gfs_vop_lookup: VOP_LOOKUP() entry point * @@ -1062,6 +1136,81 @@ gfs_vop_readdir(ap) return (error); } + +#ifdef sun +/* + * gfs_vop_map: VOP_MAP() entry point + * + * Convenient routine for handling pseudo-files that wish to allow mmap() calls. + * This function only works for readonly files, and uses the read function for + * the vnode to fill in the data. The mapped data is immediately faulted in and + * filled with the necessary data during this call; there are no getpage() or + * putpage() routines. + */ +/* ARGSUSED */ +int +gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred, + caller_context_t *ct) +{ + int rv; + ssize_t resid = len; + + /* + * Check for bad parameters + */ +#ifdef _ILP32 + if (len > MAXOFF_T) + return (ENOMEM); +#endif + if (vp->v_flag & VNOMAP) + return (ENOTSUP); + if (off > MAXOFF_T) + return (EFBIG); + if ((long)off < 0 || (long)(off + len) < 0) + return (EINVAL); + if (vp->v_type != VREG) + return (ENODEV); + if ((prot & (PROT_EXEC | PROT_WRITE)) != 0) + return (EACCES); + + /* + * Find appropriate address if needed, otherwise clear address range. + */ + as_rangelock(as); + rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (rv != 0) { + as_rangeunlock(as); + return (rv); + } + + /* + * Create mapping + */ + rv = as_map(as, *addrp, len, segvn_create, zfod_argsp); + as_rangeunlock(as); + if (rv != 0) + return (rv); + + /* + * Fill with data from read() + */ + rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE, + 0, (rlim64_t)0, cred, &resid); + + if (rv == 0 && resid != 0) + rv = ENXIO; + + if (rv != 0) { + as_rangelock(as); + (void) as_unmap(as, *addrp, len); + as_rangeunlock(as); + } + + return (rv); +} +#endif /* sun */ + /* * gfs_vop_inactive: VOP_INACTIVE() entry point * diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c index f4e2449..83f29c1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c @@ -18,9 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -75,7 +75,6 @@ xva_getxoptattr(xvattr_t *xvap) static void vn_rele_inactive(vnode_t *vp) { - vrele(vp); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 38b39bf..2adad8a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -119,16 +118,17 @@ #include #include -#include #include #include #include #include +#include #ifdef _KERNEL #include #endif #include #include +#include #include #include @@ -178,7 +178,6 @@ static boolean_t arc_warm; uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; -int zfs_mdcomp_disable = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; @@ -186,14 +185,11 @@ int zfs_arc_p_min_shift = 0; TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* * Note that buffers can be in one of 6 states: @@ -500,6 +496,7 @@ struct arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; + void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; @@ -560,7 +557,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ -#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) @@ -609,8 +605,8 @@ static buf_hash_table_t buf_hash_table; (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(buf) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; @@ -634,7 +630,7 @@ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ @@ -788,6 +784,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) 
&& \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; + hdr->b_cksum0 = 0; +} + static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { @@ -921,7 +926,8 @@ buf_cons(void *vbuf, void *unused, int kmflag) arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); - rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -937,6 +943,7 @@ hdr_dest(void *vbuf, void *unused) { arc_buf_hdr_t *buf = vbuf; + ASSERT(BUF_EMPTY(buf)); refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); @@ -949,7 +956,8 @@ buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; - rw_destroy(&buf->b_lock); + mutex_destroy(&buf->b_evict_lock); + rw_destroy(&buf->b_data_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -1077,18 +1085,31 @@ arc_buf_thaw(arc_buf_t *buf) kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_thawed) + kmem_free(buf->b_hdr->b_thawed, 1); + buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&buf->b_hdr->b_freeze_lock); } void arc_buf_freeze(arc_buf_t *buf) { + kmutex_t *hash_lock; + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); + mutex_exit(hash_lock); } static void @@ -1111,7 +1132,6 @@ get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lo static void add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) { - ASSERT(MUTEX_HELD(hash_lock)); if ((refcount_add(&ab->b_refcnt, tag) == 1) && @@ -1185,6 +1205,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; @@ -1207,7 +1228,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) /* * If prefetching out of the ghost cache, - * we will have a non-null datacnt. + * we will have a non-zero datacnt. 
*/ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { /* ghost elements have a ghost size */ @@ -1245,9 +1266,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon) { + if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); - } /* adjust state sizes */ if (to_delta) @@ -1391,14 +1411,29 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_state == arc_anon); ASSERT(buf->b_data != NULL); - VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0); - VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -1406,6 +1441,8 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; + ASSERT(hdr->b_state != arc_anon); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -1430,16 +1467,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) * must verify b_data != NULL to know if the add_ref * was successful. */ - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return; } - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); - rw_exit(&buf->b_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); @@ -1487,6 +1524,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); + if (!recycle) { if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf->b_hdr, zio_buf_free, @@ -1524,6 +1562,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; + buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); @@ -1538,55 +1577,55 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!(hdr->b_flags & ARC_STORED)); + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - if (hdr->b_l2hdr != NULL) { - if (!MUTEX_HELD(&l2arc_buflist_mtx)) { - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
- */ + if (l2hdr != NULL) { + boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + */ + if (!buflist_held) { mutex_enter(&l2arc_buflist_mtx); - if (hdr->b_l2hdr != NULL) { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, - hdr); - } - mutex_exit(&l2arc_buflist_mtx); - } else { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + l2hdr = hdr->b_l2hdr; } - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; + + if (l2hdr != NULL) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!buflist_held) + mutex_exit(&l2arc_buflist_mtx); } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); } while (hdr->b_buf) { arc_buf_t *buf = hdr->b_buf; if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1596,6 +1635,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } + if (hdr->b_thawed) { + kmem_free(hdr->b_thawed, 1); + hdr->b_thawed = NULL; + } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1616,11 +1659,17 @@ arc_buf_free(arc_buf_t *buf, void *tag) kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) + if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); - else + } else { + ASSERT(buf == hdr->b_buf); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; + } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; @@ -1637,12 +1686,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) if (destroy_hdr) arc_hdr_destroy(hdr); } else { - if (remove_reference(hdr, NULL, tag) > 0) { - ASSERT(HDR_IO_ERROR(hdr)); + if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, FALSE, TRUE); - } else { + else arc_hdr_destroy(hdr); - } } } @@ -1654,11 +1701,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { + ASSERT(hdr->b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_state != arc_anon); ASSERT(buf->b_data != NULL); @@ -1668,6 +1718,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) arc_buf_destroy(buf, FALSE, TRUE); } else if 
(no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || @@ -1747,7 +1798,8 @@ evict_start: if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - ab->b_arc_access < + arc_min_prefetch_lifespan)) { skipped++; continue; } @@ -1762,7 +1814,7 @@ evict_start: ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; - if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } @@ -1784,9 +1836,9 @@ evict_start: buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } @@ -1887,6 +1939,7 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker = { 0 }; list_t *list, *list_start; kmutex_t *hash_lock, *lock; uint64_t bytes_deleted = 0; @@ -1913,7 +1966,15 @@ evict_start: ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + hash_lock = HDR_LOCK(ab); + /* caller may be trying to modify this buffer, skip it */ + if (MUTEX_HELD(hash_lock)) + continue; if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); @@ -1936,18 +1997,21 @@ evict_start: DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; - } else { - if (bytes < 0) { - /* - * we're draining the ARC, retry - */ - mutex_exit(lock); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto evict_start; - } + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. 
+ */ + list_insert_after(list, ab, &marker); + mutex_exit(lock); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(lock); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else bufs_skipped += 1; - } } mutex_exit(lock); idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); @@ -2056,9 +2120,9 @@ restart: while (tmp_arc_eviction_list != NULL) { arc_buf_t *buf = tmp_arc_eviction_list; tmp_arc_eviction_list = buf->b_next; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); if (buf->b_efunc != NULL) VERIFY(buf->b_efunc(buf) == 0); @@ -2148,11 +2212,9 @@ static int needfree = 0; static int arc_reclaim_needed(void) { -#if 0 - uint64_t extra; -#endif #ifdef _KERNEL + if (needfree) return (1); @@ -2163,7 +2225,7 @@ arc_reclaim_needed(void) if (vm_paging_needed()) return (1); -#if 0 +#ifdef sun /* * take 'desfree' extra pages, so we reclaim sooner, rather than later */ @@ -2205,10 +2267,10 @@ arc_reclaim_needed(void) (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) return (1); #endif -#else +#else /* !sun */ if (kmem_used() > (kmem_size() * 3) / 4) return (1); -#endif +#endif /* sun */ #else if (spa_get_random(100) == 0) @@ -2290,7 +2352,7 @@ arc_reclaim_thread(void *dummy __unused) } /* reset the growth delay for every reclaim */ - growtime = LBOLT + (arc_grow_retry * hz); + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* @@ -2304,7 +2366,7 @@ arc_reclaim_thread(void *dummy __unused) arc_kmem_reap_now(last_reclaim); arc_warm = B_TRUE; - } else if (arc_no_grow && LBOLT >= growtime) { + } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { arc_no_grow = FALSE; } @@ -2411,7 +2473,7 @@ arc_evict_needed(arc_buf_contents_t type) if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) return (1); -#if 0 +#ifdef sun #ifdef _KERNEL /* * If zio data pages are being allocated out of a separate heap segment, @@ -2423,7 +2485,7 @@ arc_evict_needed(arc_buf_contents_t type) (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) return (1); #endif -#endif +#endif /* sun */ if (arc_reclaim_needed()) return (1); @@ -2543,6 +2605,8 @@ out: static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) { + clock_t now; + ASSERT(MUTEX_HELD(hash_lock)); if (buf->b_state == arc_anon) { @@ -2553,11 +2617,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) */ ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = LBOLT; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); arc_change_state(arc_mru, buf, hash_lock); } else if (buf->b_state == arc_mru) { + now = ddi_get_lbolt(); + /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read @@ -2573,7 +2639,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = LBOLT; + buf->b_arc_access = now; return; } @@ -2582,13 +2648,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (LBOLT > buf->b_arc_access + ARC_MINTIME) { + if (now > buf->b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. 
*/ - buf->b_arc_access = LBOLT; + buf->b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } @@ -2611,7 +2677,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); } - buf->b_arc_access = LBOLT; + buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); @@ -2630,7 +2696,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = LBOLT; + buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* @@ -2648,7 +2714,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) new_state = arc_mru; } - buf->b_arc_access = LBOLT; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); @@ -2658,7 +2724,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = LBOLT; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } else { @@ -2671,7 +2737,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg) == 1); } @@ -2685,6 +2752,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) *bufp = NULL; } else { *bufp = buf; + ASSERT(buf->b_data); } } @@ -2732,6 +2800,16 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); + if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { @@ -2745,8 +2823,11 @@ arc_read_done(zio_t *zio) hdr->b_acb = NULL; hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) + if (abuf == buf) { + ASSERT(buf->b_efunc == NULL); + ASSERT(hdr->b_datacnt == 1); hdr->b_flags |= ARC_BUF_AVAILABLE; + } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); @@ -2767,14 +2848,6 @@ arc_read_done(zio_t *zio) cv_broadcast(&hdr->b_cv); if (hash_lock) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - if (zio->io_error == 0 && hdr->b_state == arc_anon) - arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { /* @@ -2825,27 +2898,37 @@ arc_read_done(zio_t *zio) * * Normal callers should use arc_read and pass the arc buffer and offset * for the bp. But if you know you don't need locking, you can use - * arc_read_bp. + * arc_read_nolock. 
*/ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { int err; + if (pbuf == NULL) { + /* + * XXX This happens from traverse callback funcs, for + * the objset_phys_t block. + */ + return (arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb)); + } + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_lock, RW_READER); + rw_enter(&pbuf->b_data_lock, RW_READER); err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_lock); + rw_exit(&pbuf->b_data_lock); + return (err); } int -arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { @@ -2856,7 +2939,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, uint64_t guid = spa_guid(spa); top: - hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -2910,6 +2994,7 @@ top: } else { buf = arc_buf_clone(buf); } + } else if (*arc_flags & ARC_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { hdr->b_flags |= ARC_PREFETCH; @@ -2940,15 +3025,13 @@ top: buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = bp->blk_birth; + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } @@ -2983,12 +3066,14 @@ top: buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; - arc_get_data_buf(buf); ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; - + arc_get_data_buf(buf); + arc_access(hdr, hash_lock); } + ASSERT(!GHOST_STATE(hdr->b_state)); + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; @@ -2997,17 +3082,6 @@ top: hdr->b_acb = acb; hdr->b_flags |= ARC_IO_IN_PROGRESS; - /* - * If the buffer has been evicted, migrate it to a present state - * before issuing the I/O. Once we drop the hash-table lock, - * the header will be marked as I/O in progress and have an - * attached buffer. At this point, anybody who finds this - * buffer ought to notice that it's legit but has a pending I/O. 
- */ - - if (GHOST_STATE(hdr->b_state)) - arc_access(hdr, hash_lock); - if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr->b_dev->l2ad_writing; @@ -3023,8 +3097,8 @@ top: mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); - DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, - zbookmark_t *, zb); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, size, zbookmark_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, @@ -3110,47 +3184,15 @@ top: return (0); } -/* - * arc_read() variant to support pool traversal. If the block is already - * in the ARC, make a copy of it; otherwise, the caller will do the I/O. - * The idea is that we don't want pool traversal filling up memory, but - * if the ARC already has the data anyway, we shouldn't pay for the I/O. - */ -int -arc_tryread(spa_t *spa, blkptr_t *bp, void *data) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_mtx; - uint64_t guid = spa_guid(spa); - int rc = 0; - - hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); - - if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { - arc_buf_t *buf = hdr->b_buf; - - ASSERT(buf); - while (buf->b_data == NULL) { - buf = buf->b_next; - ASSERT(buf); - } - bcopy(buf->b_data, data, hdr->b_size); - } else { - rc = ENOENT; - } - - if (hash_mtx) - mutex_exit(hash_mtx); - - return (rc); -} - void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_efunc == NULL); + ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); + buf->b_efunc = func; buf->b_private = private; } @@ -3169,14 +3211,14 @@ arc_buf_evict(arc_buf_t *buf) list_t *list, *evicted_list; kmutex_t *lock, *evicted_lock; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (0); } else if (buf->b_data == NULL) { arc_buf_t copy = *buf; /* structure assignment */ @@ -3185,14 +3227,15 @@ arc_buf_evict(arc_buf_t *buf) * but let arc_do_user_evicts() do the reaping. 
*/ buf->b_efunc = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(copy.b_efunc(&copy) == 0); return (1); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -3211,6 +3254,7 @@ arc_buf_evict(arc_buf_t *buf) arc_state_t *old_state = hdr->b_state; arc_state_t *evicted_state; + ASSERT(hdr->b_buf == NULL); ASSERT(refcount_is_zero(&hdr->b_refcnt)); evicted_state = @@ -3230,12 +3274,13 @@ arc_buf_evict(arc_buf_t *buf) mutex_exit(lock); } mutex_exit(hash_lock); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; buf->b_hdr = NULL; + buf->b_next = NULL; kmem_cache_free(buf_cache, buf); return (1); } @@ -3250,29 +3295,30 @@ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; - boolean_t released = B_FALSE; - rw_enter(&buf->b_lock, RW_WRITER); + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. + */ + + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); - ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); - ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); - arc_buf_thaw(buf); - rw_exit(&buf->b_lock); - released = B_TRUE; } else { hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); } l2hdr = hdr->b_l2hdr; @@ -3282,9 +3328,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_size = hdr->b_size; } - if (released) - goto out; - /* * Do we have more than one buf? */ @@ -3298,14 +3341,14 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* - * Pull the data off of this buf and attach it to - * a new anonymous buf. + * Pull the data off of this hdr and attach it to + * a new anonymous hdr.
*/ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; - *bufp = (*bufp)->b_next; + *bufp = buf->b_next; buf->b_next = NULL; ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); @@ -3333,26 +3376,25 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; - mutex_exit(hash_lock); + if (hash_lock) + mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; -out: if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3361,14 +3403,27 @@ out: } } +/* + * Release this buffer. If it does not match the provided BP, fill it + * with that block's contents. + */ +/* ARGSUSED */ +int +arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb) +{ + arc_release(buf, tag); + return (0); +} + int arc_released(arc_buf_t *buf) { int released; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (released); } @@ -3377,9 +3432,9 @@ arc_has_callback(arc_buf_t *buf) { int callback; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); callback = (buf->b_efunc != NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (callback); } @@ -3389,9 +3444,9 @@ arc_referenced(arc_buf_t *buf) { int referenced; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_refcnt)); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (referenced); } #endif @@ -3431,21 +3486,28 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - hdr->b_acb = NULL; + ASSERT(hdr->b_acb == NULL); + + if (zio->io_error == 0) { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } else { + ASSERT(BUF_EMPTY(hdr)); + } - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = zio->io_bp->blk_birth; - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed - * so there will be no dva/birth-date/checksum. The buffer - * must therefor remain anonymous (and uncached). + * so there will be no dva/birth/checksum. The buffer must + * therefore remain anonymous (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; + ASSERT(zio->io_error == 0); + arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); @@ -3455,106 +3517,54 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
*/ - ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); - ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), - BP_IDENTITY(zio->io_bp))); - ASSERT3U(zio->io_bp_orig.blk_birth, ==, - zio->io_bp->blk_birth); - - ASSERT(refcount_is_zero(&exists->b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else { + /* Dedup */ + ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (hdr->b_state == arc_anon) + if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); - } else if (callback->awcb_done == NULL) { - int destroy_hdr; - /* - * This is an anonymous buffer with no user callback, - * destroy it if there are no active references. - */ - mutex_enter(&arc_eviction_mtx); - destroy_hdr = refcount_is_zero(&hdr->b_refcnt); - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) - arc_hdr_destroy(hdr); } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } - hdr->b_flags &= ~ARC_STORED; - if (callback->awcb_done) { - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - } + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } -static void -write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) -{ - boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); - - /* Determine checksum setting */ - if (ismd) { - /* - * Metadata always gets checksummed. If the data - * checksum is multi-bit correctable, and it's not a - * ZBT-style checksum, then it's suitable for metadata - * as well. Otherwise, the metadata checksum defaults - * to fletcher4. - */ - if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && - !zio_checksum_table[wp->wp_oschecksum].ci_zbt) - zp->zp_checksum = wp->wp_oschecksum; - else - zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, - wp->wp_oschecksum); - } - - /* Determine compression setting */ - if (ismd) { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; - } else { - zp->zp_compress = zio_compress_select(wp->wp_dncompress, - wp->wp_oscompress); - } - - zp->zp_type = wp->wp_type; - zp->zp_level = wp->wp_level; - zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); -} - zio_t * -arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb) +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; - zio_prop_t zp; ASSERT(ready != NULL); + ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == 0); + ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); @@ -3563,103 +3573,27 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, callback->awcb_private = private; callback->awcb_buf = buf; - write_policy(spa, wp, &zp); - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } -int -arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags) -{ - arc_buf_hdr_t *ab; - kmutex_t *hash_lock; - zio_t *zio; - uint64_t guid = spa_guid(spa); - - /* - * If this buffer is in the cache, release it, so it - * can be re-used. - */ - ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); - if (ab != NULL) { - /* - * The checksum of blocks to free is not always - * preserved (eg. on the deadlist). However, if it is - * nonzero, it should match what we have in the cache. - */ - ASSERT(bp->blk_cksum.zc_word[0] == 0 || - bp->blk_cksum.zc_word[0] == ab->b_cksum0 || - bp->blk_fill == BLK_FILL_ALREADY_FREED); - - if (ab->b_state != arc_anon) - arc_change_state(arc_anon, ab, hash_lock); - if (HDR_IO_IN_PROGRESS(ab)) { - /* - * This should only happen when we prefetch. - */ - ASSERT(ab->b_flags & ARC_PREFETCH); - ASSERT3U(ab->b_datacnt, ==, 1); - ab->b_flags |= ARC_FREED_IN_READ; - if (HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); - } else if (refcount_is_zero(&ab->b_refcnt)) { - ab->b_flags |= ARC_FREE_IN_PROGRESS; - mutex_exit(hash_lock); - arc_hdr_destroy(ab); - ARCSTAT_BUMP(arcstat_deleted); - } else { - /* - * We still have an active reference on this - * buffer. This can happen, e.g., from - * dbuf_unoverride(). 
- */ - ASSERT(!HDR_IN_HASH_TABLE(ab)); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); - } - } - - zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); - - if (arc_flags & ARC_WAIT) - return (zio_wait(zio)); - - ASSERT(arc_flags & ARC_NOWAIT); - zio_nowait(zio); - - return (0); -} - static int arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count - + cnt.v_cache_count); + uint64_t available_memory = + ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); static uint64_t page_load = 0; static uint64_t last_txg = 0; -#if 0 +#ifdef sun #if defined(__i386) available_memory = MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); #endif -#endif +#endif /* sun */ if (available_memory >= zfs_write_limit_max) return (0); @@ -3776,10 +3710,12 @@ arc_lowmem(void *arg __unused, int howto __unused) /* Serialize access via arc_lowmem_lock. */ mutex_enter(&arc_lowmem_lock); + mutex_enter(&arc_reclaim_thr_lock); needfree = 1; cv_signal(&arc_reclaim_thr_cv); while (needfree) - tsleep(&needfree, 0, "zfs:lowmem", hz / 5); + msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); + mutex_exit(&arc_reclaim_thr_lock); mutex_exit(&arc_lowmem_lock); } #endif @@ -3787,8 +3723,7 @@ arc_lowmem(void *arg __unused, int howto __unused) void arc_init(void) { - int prefetch_tunable_set = 0; - int i; + int i, prefetch_tunable_set = 0; mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); @@ -3799,7 +3734,8 @@ arc_init(void) /* Start out with 1/8 of all memory */ arc_c = kmem_size() / 8; -#if 0 + +#ifdef sun #ifdef _KERNEL /* * On architectures where the physical memory can be larger @@ -3808,7 +3744,7 @@ arc_init(void) */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif -#endif +#endif /* sun */ /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 64<<18); /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ @@ -3817,16 +3753,18 @@ arc_init(void) else arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 5, arc_c_max); + #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are * reasonable (ie. over 16MB) */ - if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size()) + if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) arc_c_max = zfs_arc_max; - if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max) + if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif + arc_c = arc_c_max; arc_p = (arc_c >> 1); @@ -3936,7 +3874,7 @@ arc_init(void) "-- to enable,\n"); printf(" add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); - zfs_prefetch_disable=1; + zfs_prefetch_disable = 1; } #else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && @@ -3945,7 +3883,7 @@ arc_init(void) "than 4GB of RAM is present;\n" " to enable, add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); - zfs_prefetch_disable=1; + zfs_prefetch_disable = 1; } #endif /* Warn about ZFS memory and address space requirements. 
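The arc_init() hunk above sizes the cache from kernel memory and only honors the loader tunables when they pass basic sanity checks (above the 16MB floor of 64 << 18 bytes, below kmem_size()). A userland sketch of that arithmetic follows; the memory figure and tunable values are invented, and the maximum here keeps only the MAX(arc_c * 5, ...) step visible in the hunk, omitting the 1GB branch.

#include <stdio.h>
#include <stdint.h>

static uint64_t
umax(uint64_t a, uint64_t b)
{
	return (a > b ? a : b);
}

int
main(void)
{
	uint64_t kmem = 8ULL << 30;		/* pretend kmem_size() == 8GB */
	uint64_t floor16mb = 64ULL << 18;	/* 16MB */
	uint64_t zfs_arc_max = 2ULL << 30;	/* pretend loader tunables */
	uint64_t zfs_arc_min = 0;

	uint64_t arc_c = kmem / 8;			/* start at 1/8 of memory */
	uint64_t arc_c_min = umax(arc_c / 4, floor16mb);/* 1/32 of memory or 16MB */
	uint64_t arc_c_max = umax(arc_c * 5, arc_c_min);

	/* Tunables must exceed 16MB and still fit, as in the hunk above. */
	if (zfs_arc_max > floor16mb && zfs_arc_max < kmem)
		arc_c_max = zfs_arc_max;
	if (zfs_arc_min > floor16mb && zfs_arc_min <= arc_c_max)
		arc_c_min = zfs_arc_min;

	printf("arc_c_min=%juMB arc_c_max=%juMB\n",
	    (uintmax_t)(arc_c_min >> 20), (uintmax_t)(arc_c_max >> 20));
	return (0);
}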
*/ @@ -4199,7 +4137,7 @@ l2arc_write_size(l2arc_dev_t *dev) static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { - clock_t interval, next; + clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the @@ -4212,7 +4150,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) else interval = hz * l2arc_feed_secs; - next = MAX(LBOLT, MIN(LBOLT + interval, began + interval)); + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); return (next); } @@ -4414,11 +4353,11 @@ l2arc_read_done(zio_t *zio) ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * Check this survived the L2ARC journey. @@ -4632,7 +4571,7 @@ top: } mutex_exit(&l2arc_buflist_mtx); - spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); dev->l2ad_evict = taddr; } @@ -4802,15 +4741,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz); - spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - spa_l2cache_space_update(dev->l2ad_vdev, 0, - dev->l2ad_end - dev->l2ad_hand); + vdev_space_update(dev->l2ad_vdev, + dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; @@ -4834,7 +4773,7 @@ l2arc_feed_thread(void *dummy __unused) l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; - clock_t begin, next = LBOLT; + clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -4843,9 +4782,9 @@ l2arc_feed_thread(void *dummy __unused) while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next - LBOLT); + next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); - next = LBOLT + hz; + next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. @@ -4856,7 +4795,7 @@ l2arc_feed_thread(void *dummy __unused) continue; } mutex_exit(&l2arc_dev_mtx); - begin = LBOLT; + begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in @@ -4875,6 +4814,16 @@ l2arc_feed_thread(void *dummy __unused) ASSERT(spa != NULL); /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { @@ -4931,7 +4880,7 @@ l2arc_vdev_present(vdev_t *vd) * validated the vdev and opened it. 
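The l2arc_write_interval() change above swaps LBOLT for ddi_get_lbolt() but keeps the scheduling rule: the next feed pass is due one interval after the previous pass began, never in the past. A small sketch of that arithmetic, with ticks as plain integers and an invented hz/feed period:

#include <stdio.h>

typedef long ticks_t;

static ticks_t tmin(ticks_t a, ticks_t b) { return (a < b ? a : b); }
static ticks_t tmax(ticks_t a, ticks_t b) { return (a > b ? a : b); }

/*
 * Schedule the next pass one interval after the previous pass *began*,
 * but never earlier than "now", so a slow pass does not make the feed
 * thread spin trying to catch up.
 */
static ticks_t
next_feed(ticks_t now, ticks_t began, ticks_t interval)
{
	return (tmax(now, tmin(now + interval, began + interval)));
}

int
main(void)
{
	ticks_t hz = 1000, interval = 5 * hz;

	/* Fast pass: began=0, now=100 -> next at began + interval = 5000. */
	printf("%ld\n", next_feed(100, 0, interval));
	/* Slow pass: began=0, now=7000 -> next is "now", not in the past. */
	printf("%ld\n", next_feed(7000, 0, interval));
	return (0);
}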
*/ void -l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; @@ -4945,8 +4894,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) adddev->l2ad_vdev = vd; adddev->l2ad_write = l2arc_write_max; adddev->l2ad_boost = l2arc_write_boost; - adddev->l2ad_start = start; - adddev->l2ad_end = end; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; @@ -4961,7 +4910,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2node)); - spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); /* * Add device to global list diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c index 93b7741..066ccc6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c @@ -19,331 +19,51 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include -static int -bplist_hold(bplist_t *bpl) -{ - ASSERT(MUTEX_HELD(&bpl->bpl_lock)); - if (bpl->bpl_dbuf == NULL) { - int err = dmu_bonus_hold(bpl->bpl_mos, - bpl->bpl_object, bpl, &bpl->bpl_dbuf); - if (err) - return (err); - bpl->bpl_phys = bpl->bpl_dbuf->db_data; - } - return (0); -} - -uint64_t -bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) -{ - int size; - - size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? 
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t); - - return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, - DMU_OT_BPLIST_HDR, size, tx)); -} void -bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) +bplist_create(bplist_t *bpl) { - VERIFY(dmu_object_free(mos, object, tx) == 0); -} - -int -bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) -{ - dmu_object_info_t doi; - int err; - - err = dmu_object_info(mos, object, &doi); - if (err) - return (err); - - mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_dbuf == NULL); - ASSERT(bpl->bpl_phys == NULL); - ASSERT(bpl->bpl_cached_dbuf == NULL); - ASSERT(bpl->bpl_queue == NULL); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR); - - bpl->bpl_mos = mos; - bpl->bpl_object = object; - bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); - bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; - bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t)); - - mutex_exit(&bpl->bpl_lock); - return (0); + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); } void -bplist_close(bplist_t *bpl) -{ - mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_queue == NULL); - - if (bpl->bpl_cached_dbuf) { - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - bpl->bpl_cached_dbuf = NULL; - } - if (bpl->bpl_dbuf) { - dmu_buf_rele(bpl->bpl_dbuf, bpl); - bpl->bpl_dbuf = NULL; - bpl->bpl_phys = NULL; - } - - mutex_exit(&bpl->bpl_lock); -} - -boolean_t -bplist_empty(bplist_t *bpl) -{ - boolean_t rv; - - if (bpl->bpl_object == 0) - return (B_TRUE); - - mutex_enter(&bpl->bpl_lock); - VERIFY(0 == bplist_hold(bpl)); /* XXX */ - rv = (bpl->bpl_phys->bpl_entries == 0); - mutex_exit(&bpl->bpl_lock); - - return (rv); -} - -static int -bplist_cache(bplist_t *bpl, uint64_t blkid) -{ - int err = 0; - - if (bpl->bpl_cached_dbuf == NULL || - bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) { - if (bpl->bpl_cached_dbuf != NULL) - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - err = dmu_buf_hold(bpl->bpl_mos, - bpl->bpl_object, blkid << bpl->bpl_blockshift, - bpl, &bpl->bpl_cached_dbuf); - ASSERT(err || bpl->bpl_cached_dbuf->db_size == - 1ULL << bpl->bpl_blockshift); - } - return (err); -} - -int -bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - if (*itorp >= bpl->bpl_phys->bpl_entries) { - mutex_exit(&bpl->bpl_lock); - return (ENOENT); - } - - blk = *itorp >> bpl->bpl_bpshift; - off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - bparray = bpl->bpl_cached_dbuf->db_data; - *bp = bparray[off]; - (*itorp)++; - mutex_exit(&bpl->bpl_lock); - return (0); -} - -int -bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) +bplist_destroy(bplist_t *bpl) { - uint64_t blk, off; - blkptr_t *bparray; - int err; - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err) - return (err); - - blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; - off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx); - bparray 
= bpl->bpl_cached_dbuf->db_data; - bparray[off] = *bp; - - /* We never need the fill count. */ - bparray[off].blk_fill = 0; - - /* The bplist will compress better if we can leave off the checksum */ - bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); - - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - bpl->bpl_phys->bpl_entries++; - bpl->bpl_phys->bpl_bytes += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp); - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); - bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpl->bpl_lock); - - return (0); + list_destroy(&bpl->bpl_list); + mutex_destroy(&bpl->bpl_lock); } -/* - * Deferred entry; will be written later by bplist_sync(). - */ void -bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) +bplist_append(bplist_t *bpl, const blkptr_t *bp) { - bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); - ASSERT(!BP_IS_HOLE(bp)); mutex_enter(&bpl->bpl_lock); - bpq->bpq_blk = *bp; - bpq->bpq_next = bpl->bpl_queue; - bpl->bpl_queue = bpq; + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); } void -bplist_sync(bplist_t *bpl, dmu_tx_t *tx) +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) { - bplist_q_t *bpq; + bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpq = bpl->bpl_queue) != NULL) { - bpl->bpl_queue = bpq->bpq_next; + while (bpe = list_head(&bpl->bpl_list)) { + list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); - VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx)); - kmem_free(bpq, sizeof (*bpq)); + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); mutex_enter(&bpl->bpl_lock); } mutex_exit(&bpl->bpl_lock); } - -void -bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) -{ - mutex_enter(&bpl->bpl_lock); - ASSERT3P(bpl->bpl_queue, ==, NULL); - VERIFY(0 == bplist_hold(bpl)); - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - VERIFY(0 == dmu_free_range(bpl->bpl_mos, - bpl->bpl_object, 0, -1ULL, tx)); - bpl->bpl_phys->bpl_entries = 0; - bpl->bpl_phys->bpl_bytes = 0; - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp = 0; - bpl->bpl_phys->bpl_uncomp = 0; - } - mutex_exit(&bpl->bpl_lock); -} - -int -bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - *usedp = bpl->bpl_phys->bpl_bytes; - if (bpl->bpl_havecomp) { - *compp = bpl->bpl_phys->bpl_comp; - *uncompp = bpl->bpl_phys->bpl_uncomp; - } - mutex_exit(&bpl->bpl_lock); - - if (!bpl->bpl_havecomp) { - uint64_t itor = 0, comp = 0, uncomp = 0; - blkptr_t bp; - - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - comp += BP_GET_PSIZE(&bp); - uncomp += BP_GET_UCSIZE(&bp); - } - if (err == ENOENT) - err = 0; - *compp = comp; - *uncompp = uncomp; - } - - return (err); -} - -/* - * Return (in *dasizep) the amount of space on the deadlist which is: - * mintxg < blk_birth <= maxtxg - */ -int -bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *dasizep) -{ - uint64_t size = 0; - uint64_t itor = 0; - blkptr_t bp; - int err; - - /* - * As an optimization, if they want the whole txg range, just - * get bpl_bytes rather than iterating over the bps. 
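The bplist rewrite above drops the on-disk object entirely: the new bplist is a mutex-protected in-memory list that is appended to in open context and drained through a callback at sync time. A userland analog using pthreads follows; the blkptr type is a stand-in for the real 128-byte blkptr_t, and lock handling mirrors the kernel version, which drops the list lock around each callback.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <pthread.h>

typedef struct blkptr { uint64_t birth, size; } blkptr_t;

typedef struct entry {
	blkptr_t	bp;
	struct entry	*next;
} entry_t;

typedef struct bplist {
	pthread_mutex_t	lock;
	entry_t		*head, **tailp;
} bplist_t;

static void
bplist_create(bplist_t *bpl)
{
	pthread_mutex_init(&bpl->lock, NULL);
	bpl->head = NULL;
	bpl->tailp = &bpl->head;
}

static void
bplist_append(bplist_t *bpl, const blkptr_t *bp)
{
	entry_t *e = malloc(sizeof (*e));

	e->bp = *bp;
	e->next = NULL;
	pthread_mutex_lock(&bpl->lock);
	*bpl->tailp = e;
	bpl->tailp = &e->next;
	pthread_mutex_unlock(&bpl->lock);
}

/* Drain the list, dropping the lock around the callback. */
static void
bplist_iterate(bplist_t *bpl, void (*func)(void *, const blkptr_t *), void *arg)
{
	entry_t *e;

	pthread_mutex_lock(&bpl->lock);
	while ((e = bpl->head) != NULL) {
		bpl->head = e->next;
		if (bpl->head == NULL)
			bpl->tailp = &bpl->head;
		pthread_mutex_unlock(&bpl->lock);
		func(arg, &e->bp);
		free(e);
		pthread_mutex_lock(&bpl->lock);
	}
	pthread_mutex_unlock(&bpl->lock);
}

static void
print_bp(void *arg, const blkptr_t *bp)
{
	(void) arg;
	printf("birth=%ju size=%ju\n", (uintmax_t)bp->birth, (uintmax_t)bp->size);
}

int
main(void)
{
	bplist_t bpl;
	blkptr_t bp = { 42, 131072 };

	bplist_create(&bpl);
	bplist_append(&bpl, &bp);
	bplist_iterate(&bpl, print_bp, NULL);
	return (0);
}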
- */ - if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err == 0) - *dasizep = bpl->bpl_phys->bpl_bytes; - mutex_exit(&bpl->bpl_lock); - return (err); - } - - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { - size += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); - } - } - if (err == ENOENT) - err = 0; - *dasizep = size; - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c new file mode 100644 index 0000000..72be312 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c @@ -0,0 +1,495 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, 
NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; + bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); + bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_phys = bpo->bpo_dbuf->db_data; + return (0); +} + +void +bpobj_close(bpobj_t *bpo) +{ + /* Lame workaround for closing a bpobj that was never opened. */ + if (bpo->bpo_object == 0) + return; + + dmu_buf_rele(bpo->bpo_dbuf, bpo); + if (bpo->bpo_cached_dbuf != NULL) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + bpo->bpo_dbuf = NULL; + bpo->bpo_phys = NULL; + bpo->bpo_cached_dbuf = NULL; + bpo->bpo_object = 0; + + mutex_destroy(&bpo->bpo_lock); +} + +static int +bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, + boolean_t free) +{ + dmu_object_info_t doi; + int epb; + int64_t i; + int err = 0; + dmu_buf_t *dbuf = NULL; + + mutex_enter(&bpo->bpo_lock); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + blkptr_t *bparray; + blkptr_t *bp; + uint64_t offset, blkoff; + + offset = i * sizeof (blkptr_t); + blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, + FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + bparray = dbuf->db_data; + bp = &bparray[blkoff]; + err = func(arg, bp, tx); + if (err) + break; + if (free) { + bpo->bpo_phys->bpo_bytes -= + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); + } + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + i++; + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, + i * sizeof (blkptr_t), -1ULL, tx)); + } + if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) + goto out; + + ASSERT(bpo->bpo_havecomp); + err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); + if (err) { + mutex_exit(&bpo->bpo_lock); + return (err); + } + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + bpobj_t sublist; + uint64_t used_before, comp_before, uncomp_before; + uint64_t used_after, comp_after, uncomp_after; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); + if (err) + break; + if (free) { + err = bpobj_space(&sublist, + &used_before, &comp_before, &uncomp_before); + if (err) + 
break; + } + err = bpobj_iterate_impl(&sublist, func, arg, tx, free); + if (free) { + VERIFY3U(0, ==, bpobj_space(&sublist, + &used_after, &comp_after, &uncomp_after)); + bpo->bpo_phys->bpo_bytes -= used_before - used_after; + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + bpo->bpo_phys->bpo_comp -= comp_before - comp_after; + bpo->bpo_phys->bpo_uncomp -= + uncomp_before - uncomp_after; + } + + bpobj_close(&sublist); + if (err) + break; + if (free) { + err = dmu_object_free(bpo->bpo_os, + objarray[blkoff], tx); + if (err) + break; + bpo->bpo_phys->bpo_num_subobjs--; + ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + (i + 1) * sizeof (uint64_t), -1ULL, tx)); + } + +out: + /* If there are no entries, there should be no bytes. */ + ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || + bpo->bpo_phys->bpo_bytes == 0); + + mutex_exit(&bpo->bpo_lock); + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); +} + +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp, subsubobjs; + + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + + if (used == 0) { + /* No point in having an empty subobj. */ + bpobj_close(&subbpo); + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + } + + mutex_enter(&bpo->bpo_lock); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. This reduces + * recursion in bpobj_iterate due to nested subobjs. 
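bpobj_iterate_impl() above walks the block pointers from the tail backwards, lets the callback stop the walk by returning nonzero, and in the "free" variant drops every successfully visited entry while keeping the byte accounting consistent even on a partial walk. The sketch below is a single-level analog of that pattern with a plain array standing in for the dbuf-backed blkptr blocks; the real code also descends into sub-bpobjs and tracks compressed/uncompressed totals.

#include <stdio.h>

struct bpo {
	long	entries[8];
	int	nentries;
	long	bytes;
};

static int
iterate_impl(struct bpo *bpo, int (*func)(void *, long), void *arg, int free_them)
{
	int err = 0;

	for (int i = bpo->nentries - 1; i >= 0; i--) {
		err = func(arg, bpo->entries[i]);
		if (err != 0)
			break;		/* failing entry is not removed */
		if (free_them) {
			bpo->bytes -= bpo->entries[i];
			bpo->nentries--;
		}
	}
	return (err);
}

static int
print_entry(void *arg, long size)
{
	(void) arg;
	printf("entry of %ld bytes\n", size);
	return (0);
}

int
main(void)
{
	struct bpo bpo = { { 512, 4096, 131072 }, 3, 512 + 4096 + 131072 };

	(void) iterate_impl(&bpo, print_entry, NULL, 1);
	printf("left: %d entries, %ld bytes\n", bpo.nentries, bpo.bytes);
	return (0);
}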
+ */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset == doi.doi_data_block_size) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, + subsubobjs, tx)); + } + } + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); + + bpobj_close(&subbpo); +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(!BP_IS_HOLE(bp)); + + /* We never need the fill count. */ + stored_bp.blk_fill = 0; + + /* The bpobj will compress better if we can leave off the checksum */ + if (!BP_GET_DEDUP(bp)) + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + bpo->bpo_phys->bpo_bytes += + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + sra->used += bp_get_dsize_sync(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + *uncompp = bpo->bpo_phys->bpo_uncomp; + mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + /* + * As an optimization, if they want the whole txg 
range, just + * get bpo_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index cf983e2..f6b2d99 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -34,12 +33,12 @@ #include #include #include +#include +#include static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); -static arc_done_func_t dbuf_write_ready; -static arc_done_func_t dbuf_write_done; /* * Global data structures and functions for the dbuf cache. @@ -107,7 +106,7 @@ dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t obj = dn->dn_object; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; @@ -138,7 +137,7 @@ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = db->db_objset; + objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; @@ -218,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db) db->db_evict_func = NULL; } +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + void dbuf_evict(dmu_buf_impl_t *db) { @@ -282,7 +297,8 @@ dbuf_fini(void) static void dbuf_verify(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db) return; ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); @@ -297,24 +315,35 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DB_BONUS_BLKID || - list_head(&dn->dn_dbufs)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !list_is_empty(&dn->dn_dbufs)); } - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, 0); } else { 
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } + for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { - dbuf_dirty_record_t *dr = db->db_data_pending; + dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. @@ -331,8 +360,9 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; @@ -344,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db) * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -352,7 +382,8 @@ dbuf_verify(dmu_buf_impl_t *db) } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - db->db.db_data && db->db_blkid != DB_BONUS_BLKID && + (db->db_buf == NULL || db->db_buf->b_data) && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, @@ -368,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db) } } } + DB_DNODE_EXIT(db); } #endif @@ -396,8 +428,35 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) } else { dbuf_evict_user(db); db->db.db_data = NULL; - db->db_state = DB_UNCACHED; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; + } +} + +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + spa_t *spa; + + mutex_exit(&db->db_mtx); + DB_GET_SPA(&spa, db); + abuf = arc_loan_buf(spa, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + dbuf_set_data(db, NULL); + mutex_exit(&db->db_mtx); } + return (abuf); } uint64_t @@ -436,24 +495,26 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - dbuf_rele(db, NULL); + dbuf_rele_and_unlock(db, NULL); } static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; arc_buf_t *pbuf; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
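Throughout these dbuf.c hunks the raw db->db_dnode pointer is replaced by a handle plus DB_DNODE_ENTER/DB_DNODE/DB_DNODE_EXIT brackets, so every dereference pins the dnode while it is in use (the file list adds a zrlock.h that presumably backs this). The sketch below shows only the bracketing discipline, with a pthread rwlock standing in for whatever the real handle lock is and with invented structure names.

#include <stdio.h>
#include <pthread.h>

typedef struct dnode { int object; } dnode_t;

typedef struct dnode_handle {
	pthread_rwlock_t	zrlock;
	dnode_t			*dnh_dnode;
} dnode_handle_t;

typedef struct dbuf {
	dnode_handle_t		*db_dnode_handle;
} dbuf_t;

/* Pin the dnode for the duration of the dereference. */
#define	DB_DNODE_ENTER(db)	pthread_rwlock_rdlock(&(db)->db_dnode_handle->zrlock)
#define	DB_DNODE(db)		((db)->db_dnode_handle->dnh_dnode)
#define	DB_DNODE_EXIT(db)	pthread_rwlock_unlock(&(db)->db_dnode_handle->zrlock)

int
main(void)
{
	dnode_t dn = { 7 };
	dnode_handle_t h;
	dbuf_t db;

	pthread_rwlock_init(&h.zrlock, NULL);
	h.dnh_dnode = &dn;
	db.db_dnode_handle = &h;

	DB_DNODE_ENTER(&db);
	printf("object %d\n", DB_DNODE(&db)->object);
	DB_DNODE_EXIT(&db);
	return (0);
}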
*/ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -461,7 +522,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); @@ -471,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + DB_DNODE_EXIT(db); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -489,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); + DB_DNODE_EXIT(db); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; @@ -496,17 +559,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) return; } + spa = dn->dn_objset->os_spa; + DB_DNODE_EXIT(db); + db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_L2CACHE; - zb.zb_objset = db->db_objset->os_dsl_dataset ? - db->db_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; + SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ @@ -516,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) else pbuf = db->db_objset->os_phys_buf; - (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + (void) dsl_read(zio, spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -530,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) int err = 0; int havepzio = (zio != NULL); int prefetch; + dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it @@ -537,46 +602,54 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ ASSERT(!refcount_is_zero(&db->db_holds)); + if (db->db_state == DB_NOFILL) + return (EIO); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + rw_enter(&dn->dn_struct_rwlock, RW_READER); - prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { - if (zio == NULL) { - zio = zio_root(db->db_dnode->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - } + spa_t *spa = dn->dn_objset->os_spa; + + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_read_impl(db, zio, &flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, flags & DB_RF_CACHED); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { @@ -600,18 +673,21 @@ static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; + } else if (db->db_state == DB_NOFILL) { + dbuf_set_data(db, NULL); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } @@ -643,18 +719,18 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (dr == NULL || (dr->dt.dl.dr_data != - ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, + * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -662,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dr->dt.dl.dr_data = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); + spa_t *spa; + + DB_GET_SPA(&spa, db); + dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { dbuf_set_data(db, NULL); @@ -674,22 +752,25 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); - if (db->db_blkid == DB_BONUS_BLKID || + if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; + ASSERT(db->db_data_pending != dr); + /* free this block */ - if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { - /* XXX can get silent EIO here */ - (void) dsl_free(NULL, - spa_get_dsl(db->db_dnode->dn_objset->os_spa), - txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); + if (!BP_IS_HOLE(bp)) { + spa_t *spa; + + DB_GET_SPA(&spa, db); + zio_free(spa, txg, bp); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* @@ -719,7 +800,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) uint64_t first_l1 = start >> epbs; uint64_t last_l1 = end >> epbs; - if (end > dn->dn_maxblkid) { + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { end = dn->dn_maxblkid; last_l1 = end >> epbs; } @@ -727,7 +808,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level == 1 && db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { @@ -755,6 +836,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) mutex_enter(&db->db_mtx); if (db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); @@ -782,7 +864,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) * size to reflect that this buffer may * contain new data when we sync. */ - if (db->db_blkid > dn->dn_maxblkid) + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { @@ -825,10 +908,15 @@ dbuf_block_freeable(dmu_buf_impl_t *db) else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; - /* If we don't exist or are in a snapshot, we can't be freed */ + /* + * If we don't exist or are in a snapshot, we can't be freed. 
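The dbuf_block_freeable() comment above turns on the block's birth txg. In broad strokes, overwriting a block only gives space back if the block was born after the dataset's most recent snapshot; otherwise a snapshot still references it and it goes to a deadlist instead. The check below is an illustration of that idea under those assumptions, not the dsl_dataset_block_freeable() implementation.

#include <stdio.h>
#include <stdint.h>

static int
block_frees_space(uint64_t birth_txg, uint64_t prev_snap_txg)
{
	if (birth_txg == 0)
		return (0);		/* hole: nothing on disk to free */
	return (birth_txg > prev_snap_txg);
}

int
main(void)
{
	printf("%d\n", block_frees_space(0, 100));	/* 0: never written */
	printf("%d\n", block_frees_space(90, 100));	/* 0: held by a snapshot */
	printf("%d\n", block_frees_space(150, 100));	/* 1: freeable */
	return (0);
}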
+ * Don't pass the bp to dsl_dataset_block_freeable() since we + * are holding the db_mtx lock and might deadlock if we are + * prefetching a dedup-ed block. + */ if (birth_txg) return (ds == NULL || - dsl_dataset_block_freeable(ds, birth_txg)); + dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (FALSE); } @@ -839,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dbuf_will_dirty() with the dn_struct_rwlock held @@ -858,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dbuf_will_dirty(db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); + buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -878,14 +970,36 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); - dnode_willuse_space(db->db_dnode, size-osize, tx); + dnode_willuse_space(dn, size-osize, tx); + DB_DNODE_EXIT(db); +} + +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os; + zbookmark_t zb; + + DB_GET_OBJSET(&os, db); + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + zb.zb_objset = os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + (void) arc_release_bp(db->db_buf, db, + db->db_blkptr, os->os_spa, &zb); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; @@ -895,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they @@ -920,7 +1036,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL); + db->db_state == DB_CACHED || db->db_state == DB_FILL || + db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* @@ -936,6 +1053,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } mutex_exit(&dn->dn_mtx); + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + /* * If this buffer is already dirty, we're done. */ @@ -945,13 +1065,16 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + DB_DNODE_EXIT(db); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. 
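The db_last_dirty walk in dbuf_dirty() above keeps the dbuf's dirty records newest-txg-first: the search skips records from later txgs and either reuses an existing record for the current txg or links a new one ahead of the older ones. A minimal list sketch of that find-or-insert step, with invented types:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct dirty_record {
	uint64_t		dr_txg;
	struct dirty_record	*dr_next;
} dirty_record_t;

static dirty_record_t *
find_or_add_dirty(dirty_record_t **last_dirty, uint64_t txg)
{
	dirty_record_t **drp = last_dirty, *dr;

	/* Skip records dirtied in later txgs. */
	while ((dr = *drp) != NULL && dr->dr_txg > txg)
		drp = &dr->dr_next;
	if (dr != NULL && dr->dr_txg == txg)
		return (dr);			/* already dirty in this txg */

	dr = calloc(1, sizeof (*dr));
	dr->dr_txg = txg;
	dr->dr_next = *drp;			/* insert, keeping descending order */
	*drp = dr;
	return (dr);
}

int
main(void)
{
	dirty_record_t *head = NULL;

	(void) find_or_add_dirty(&head, 10);
	(void) find_or_add_dirty(&head, 12);
	(void) find_or_add_dirty(&head, 12);	/* reuses the txg 12 record */
	for (dirty_record_t *dr = head; dr != NULL; dr = dr->dr_next)
		printf("dirty in txg %ju\n", (uintmax_t)dr->dr_txg);
	return (0);
}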
*/ dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT) + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); @@ -979,18 +1102,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ + os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dasize() while + * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); @@ -1006,22 +1130,26 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_level == 0) { void *data_old = db->db_buf; - if (db->db_blkid == DB_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so that we - * can modify it without impacting possible other users - * of this cached data block. Note that indirect - * blocks and private objects are not released until the - * syncing state (since they are only modified then). - */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); } - ASSERT(data_old != NULL); dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -1039,7 +1167,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); @@ -1055,17 +1184,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&db->db_mtx); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
- bp_get_dasize(os->os_spa, bp) : db->db.db_size; + bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much @@ -1074,6 +1205,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ + ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } @@ -1097,6 +1229,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) @@ -1121,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || - db->db_parent == db->db_dnode->dn_dbuf); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); @@ -1132,21 +1264,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); - /* * If this buffer is not dirty, we're done. */ @@ -1158,6 +1290,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (0); } ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* * If this buffer is currently held, we cannot undirty @@ -1171,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); + DB_DNODE_EXIT(db); return (0); } @@ -1192,14 +1329,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } + DB_DNODE_EXIT(db); if (db->db_level == 0) { - dbuf_unoverride(dr); + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + } } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -1214,7 +1355,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; - ASSERT(arc_released(buf)); + ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); dbuf_evict(db); @@ -1234,18 +1375,30 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void +dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_state = DB_NOFILL; + + 
dmu_buf_will_fill(db_fake, tx); +} + +void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1267,7 +1420,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); @@ -1287,8 +1440,7 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ASSERT(buf != NULL); @@ -1311,9 +1463,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db) == 1); + xuio_stat_wbuf_copied(); return; } + xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; @@ -1349,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_buf_evict() * ARC: dbuf_do_evict()->dbuf_destroy() @@ -1357,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) void dbuf_clear(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb = dn->dn_dbuf; + dmu_buf_impl_t *dndb; int dbuf_gone = FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1369,7 +1523,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } @@ -1377,16 +1531,32 @@ dbuf_clear(dmu_buf_impl_t *db) db->db_state = DB_UNCACHED; } - ASSERT3U(db->db_state, ==, DB_UNCACHED); + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; - if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; + if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. + * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. 
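The dbuf_clear() comment above describes why the dn_dbufs_count decrement pairs with membar_producer(): an observer that reads the smaller count must also see the list removal that preceded it. The sketch below expresses the same protocol with C11 release/acquire atomics standing in for atomic_dec_32_nv() plus the producer barrier; it is an illustration of the ordering, not the kernel primitives.

#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

static _Atomic uint32_t dbufs_count = 1;
static int list_unlinked = 0;

static void
release_dbuf(void)
{
	list_unlinked = 1;	/* e.g. list_remove(&dn->dn_dbufs, db) */
	/* Release ordering publishes the unlink before the new count. */
	atomic_fetch_sub_explicit(&dbufs_count, 1, memory_order_release);
}

static void
observer(void)
{
	if (atomic_load_explicit(&dbufs_count, memory_order_acquire) == 0) {
		/* Seeing count == 0 implies the unlink above is visible too. */
		printf("unlinked=%d\n", list_unlinked);
	}
}

int
main(void)
{
	release_dbuf();
	observer();
	return (0);
}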
+ */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; + } else { + DB_DNODE_EXIT(db); } if (db->db_buf) @@ -1396,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); /* - * If this dbuf is referened from an indirect dbuf, + * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) @@ -1412,7 +1582,20 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; *bpp = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_have_spill && + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + *bpp = &dn->dn_phys->dn_spill; + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; @@ -1461,7 +1644,7 @@ static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -1475,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; - db->db_dnode = dn; + db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; @@ -1485,16 +1668,20 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_immediate_evict = 0; db->db_freed_in_flight = 0; - if (blkid == DB_BONUS_BLKID) { + if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - db->db.db_offset = DB_BONUS_BLKID; + db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); + } else if (blkid == DMU_SPILL_BLKID) { + db->db.db_size = (blkptr != NULL) ? + BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; + db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1<dn_indblkshift : dn->dn_datablksz; @@ -1528,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); @@ -1562,20 +1750,29 @@ dbuf_destroy(dmu_buf_impl_t *db) { ASSERT(refcount_is_zero(&db->db_holds)); - if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID) { /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (db->db_dnode) { - dnode_t *dn = db->db_dnode; + if (db->db_dnode_handle != NULL) { + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); - + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold + * corresponding to the removed dbuf is no longer + * discounted in dnode_move(), so the dnode cannot be + * moved until after we release the hold. 
+ */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } @@ -1598,7 +1795,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (dnode_block_freed(dn, blkid)) @@ -1606,37 +1803,34 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) /* dbuf_find() returns with db_mtx held */ if (db = dbuf_find(dn, 0, blkid)) { - if (refcount_count(&db->db_holds) > 0) { - /* - * This dbuf is active. We assume that it is - * already CACHED, or else about to be either - * read or filled. - */ - mutex_exit(&db->db_mtx); - return; - } + /* + * This dbuf is already in the cache. We assume that + * it is already CACHED, or else about to be either + * read or filled. + */ mutex_exit(&db->db_mtx); - db = NULL; + return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { + int priority = dn->dn_type == DMU_OT_DDT_ZAP ? + ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; arc_buf_t *pbuf; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; - zb.zb_objset = dn->dn_objset->os_dsl_dataset ? - dn->dn_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = dn->dn_object; - zb.zb_level = 0; - zb.zb_blkid = blkid; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, 0, blkid); if (db) pbuf = db->db_buf; else pbuf = dn->dn_objset->os_phys_buf; - (void) arc_read(NULL, dn->dn_objset->os_spa, - bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + (void) dsl_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, priority, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } @@ -1655,7 +1849,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, { dmu_buf_impl_t *db, *parent = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); @@ -1704,7 +1898,7 @@ top: * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. 
*/ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; @@ -1713,7 +1907,7 @@ top: arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); @@ -1729,7 +1923,7 @@ top: if (parent) dbuf_rele(parent, NULL); - ASSERT3P(db->db_dnode, ==, dn); + ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; @@ -1759,7 +1953,38 @@ dbuf_create_bonus(dnode_t *dn) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + if (db->db_blkid != DMU_SPILL_BLKID) + return (ENOTSUP); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + if (blksz > SPA_MAXBLOCKSIZE) + blksz = SPA_MAXBLOCKSIZE; + else + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dbuf_new_size(db, blksz, tx); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + return (0); +} + +void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1770,15 +1995,38 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) ASSERT(holds > 1); } +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ #pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) { + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. + */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +{ int64_t holds; - mutex_enter(&db->db_mtx); + ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); @@ -1794,15 +2042,29 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_evict_user(db); if (holds == 0) { - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { mutex_exit(&db->db_mtx); - dnode_rele(db->db_dnode, db); + + /* + * If the dnode moves here, we cannot cross this barrier + * until the move completes. + */ + DB_DNODE_ENTER(db); + (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); + DB_DNODE_EXIT(db); + /* + * The bonus buffer's dnode hold is no longer discounted + * in dnode_move(). The dnode cannot move until after + * the dnode_rele(). 
+ */ + dnode_rele(DB_DNODE(db), db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ - ASSERT3U(db->db_state, ==, DB_UNCACHED); + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); dbuf_evict(db); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; @@ -1892,7 +2154,7 @@ dmu_buf_freeable(dmu_buf_t *dbuf) if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, - db->db_blkptr->blk_birth); + db->db_blkptr, db->db_blkptr->blk_birth); return (res); } @@ -1906,6 +2168,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) if (db->db_blkptr != NULL) return; + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = &dn->dn_phys->dn_spill; + BP_ZERO(db->db_blkptr); + return; + } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was @@ -1941,7 +2208,7 @@ static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; + dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); @@ -1959,10 +2226,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); ASSERT(db->db_buf != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); db->db_data_pending = dr; @@ -1982,8 +2252,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); @@ -2002,23 +2272,34 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { - ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). 
*/ - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -2028,6 +2309,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); + ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); @@ -2036,11 +2318,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } + os = dn->dn_objset; + /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we @@ -2050,7 +2333,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immdiate write, + * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { @@ -2059,43 +2342,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - /* - * If this dbuf has already been written out via an immediate write, - * just complete the write by copying over the new block pointer and - * updating the accounting via the write-completion functions. 
- */ - if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - zio_t zio_fake; - - zio_fake.io_private = &db; - zio_fake.io_error = 0; - zio_fake.io_bp = db->db_blkptr; - zio_fake.io_bp_orig = *db->db_blkptr; - zio_fake.io_txg = txg; - zio_fake.io_flags = 0; - - *db->db_blkptr = dr->dt.dl.dr_overridden_by; - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - db->db_data_pending = dr; - dr->dr_zio = &zio_fake; - mutex_exit(&db->db_mtx); - - ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), - BP_IDENTITY(&zio_fake.io_bp_orig)) || - BP_IS_HOLE(zio_fake.io_bp)); - - if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio_fake.io_bp_orig, dn->dn_zio, tx); - - dbuf_write_ready(&zio_fake, db->db_buf, db); - dbuf_write_done(&zio_fake, db->db_buf, db); - - return; - } - - if (dn->dn_object != DMU_META_DNODE_OBJECT && + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && + dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there @@ -2113,8 +2363,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } - - ASSERT(*datap != NULL); db->db_data_pending = dr; mutex_exit(&db->db_mtx); @@ -2122,10 +2370,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - else + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns. We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. + */ + DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); + } } void @@ -2154,111 +2412,53 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } } -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_t zb; - writeprops_t wp = { 0 }; - zio_t *zio; - - if (!BP_IS_HOLE(db->db_blkptr) && - (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(data, db); - } else { - ASSERT(arc_released(data)); - /* XXX why do we need to thaw here? */ - arc_buf_thaw(data); - } - - if (parent != dn->dn_dbuf) { - ASSERT(parent && parent->db_data_pending); - ASSERT(db->db_level == parent->db_level-1); - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - zb.zb_objset = os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - - wp.wp_type = dn->dn_type; - wp.wp_level = db->db_level; - wp.wp_copies = os->os_copies; - wp.wp_dncompress = dn->dn_compress; - wp.wp_oscompress = os->os_compress; - wp.wp_dnchecksum = dn->dn_checksum; - wp.wp_oschecksum = os->os_checksum; - - if (BP_IS_OLDER(db->db_blkptr, txg)) - (void) dsl_dataset_block_kill( - os->os_dsl_dataset, db->db_blkptr, zio, tx); - - dr->dr_zio = arc_write(zio, os->os_spa, &wp, - DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, - data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); -} - /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; + spa_t *spa = zio->io_spa; + int64_t delta; uint64_t fill = 0; - int old_size, new_size, i; + int i; ASSERT(db->db_blkptr == bp); - dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); - - old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, bp); - - dnode_diduse_space(dn, new_size - old_size); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta - zio->io_prev_space_delta); + zio->io_prev_space_delta = delta; if (BP_IS_HOLE(bp)) { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); - ASSERT3U(bp->blk_fill, ==, 0); + ASSERT(bp->blk_fill == 0); + DB_DNODE_EXIT(db); return; } - ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); mutex_enter(&db->db_mtx); +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + } +#endif + if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid) + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); @@ -2281,21 +2481,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill += ibp->blk_fill; } } + DB_DNODE_EXIT(db); bp->blk_fill = fill; mutex_exit(&db->db_mtx); - - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); - dsl_dataset_block_born(ds, bp, tx); - } } /* ARGSUSED */ @@ -2303,34 +2493,70 @@ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t txg = zio->io_txg; dbuf_dirty_record_t **drp, *dr; ASSERT3U(zio->io_error, ==, 0); + ASSERT(db->db_blkptr == bp); + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + objset_t *os; + dsl_dataset_t *ds; + dmu_tx_t *tx; + + DB_GET_OBJSET(&os, db); + ds = os->os_dsl_dataset; + tx = 
os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); + } +#endif + if (db->db_level == 0) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); - else if (!BP_IS_HOLE(db->db_blkptr)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); - else - ASSERT(arc_released(db->db_buf)); + if (db->db_state != DB_NOFILL) { + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + else if (!arc_released(db->db_buf)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } } else { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -2342,6 +2568,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } + DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } @@ -2351,9 +2578,134 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } mutex_exit(&db->db_mtx); - dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); + dbuf_write_done(zio, NULL, db); +} - dbuf_rele(db, (void *)(uintptr_t)txg); +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_prop_t zp; + zio_t *zio; + int wp_flag = 0; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + + if (db->db_state != DB_NOFILL) { + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and 
we don't want the + * overhead of making multiple copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + dbuf_release_bp(db); + } + } + } + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); + + if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + ASSERT(db->db_state != DB_NOFILL); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + dbuf_write_override_ready, dbuf_write_override_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, NULL, db->db.db_size, &zp, + dbuf_write_nofill_ready, dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + dr->dr_zio = arc_write(zio, os->os_spa, txg, + db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, + dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c new file mode 100644 index 0000000..0edf62e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c @@ -0,0 +1,1152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Enable/disable prefetching of dedup-ed blocks which are going to be freed. + */ +int zfs_dedup_prefetch = 1; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP"); +TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch); +SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch, + 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); + +static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, +}; + +static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", +}; + +static void +ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static void +ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + ASSERT(ddt_object_count(ddt, type, class) == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + + *objectp = 0; +} + +static int +ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class]); + + /* + * Seed the cached statistics. 
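ddt_object_name() below builds the MOS directory key for each on-disk dedup table from the checksum name, the ops name and the class name. Assuming DMU_POOL_DDT is the "DDT-%s-%s-%s" format string from dmu.h, a sketch of the names it produces:

	char name[DDT_NAMELEN];

	/* e.g. checksum sha256, the "zap" ops, class "duplicate" ... */
	(void) sprintf(name, "DDT-%s-%s-%s", "sha256", "zap", "duplicate");
	/*
	 * ... yields "DDT-sha256-zap-duplicate", the key used against
	 * DMU_POOL_DIRECTORY_OBJECT in ddt_object_create()/ddt_object_load().
	 */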
+ */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ASSERT(error == 0); + return (error); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static void +ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); +} + +int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +uint64_t +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class])); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t 
*bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_DEDUP); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + 
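ddt_stat_add() above folds addition and subtraction into a single loop: with neg == 0 each field is added unchanged, while with neg == -1ULL the identity (x ^ ~0) - ~0 == ~x + 1 == -x (two's complement) turns the same expression into a subtraction. A small standalone illustration of that identity, written as plain userland C with hypothetical values:

#include <stdint.h>
#include <assert.h>

int
main(void)
{
	uint64_t d = 100, s = 30;
	uint64_t add = 0, sub = -1ULL;

	/* neg == 0: the term passes through unchanged */
	assert(((s ^ add) - add) == s);

	/* neg == -1ULL: (s ^ ~0) - ~0 == -s, so "+=" becomes "-=" */
	d += (s ^ sub) - sub;
	assert(d == 70);

	return (0);
}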
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} + +int +ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) +{ + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); +} + +int +ddt_ditto_copies_present(ddt_entry_t *dde) +{ + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && 
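ddt_ditto_copies_needed() above turns the total reference count into a desired DVA count using the pool's dedupditto threshold and returns only the shortfall. A worked example, assuming spa_dedup_ditto has been set to 100:

/*
 *	total_refcnt		desired copies
 *	1 .. 99			1	(refcnt >= 1)
 *	100 .. 9999		2	(refcnt >= ditto)
 *	10000 and up		3	(refcnt >= ditto * ditto)
 *
 * A block already written with two DVAs (total_copies == 2) whose
 * reference count has grown to 12000 would therefore need one more copy.
 */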
copies < SPA_DVAS_PER_BP); + + return (copies); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + + return (c_len + 1); +} + +void +ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) +{ + return (spa->spa_ddt[c]); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_data != NULL) + zio_buf_free(dde->dde_repair_data, + DDK_GET_PSIZE(&dde->dde_key)); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) + break; + } + if (error != ENOENT) + break; + } + + ASSERT(error == 0 || error == ENOENT); + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +void +ddt_prefetch(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!zfs_dedup_prefetch || bp == 
NULL || !BP_GET_DEDUP(bp)) + return; + + /* + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. + */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } +} + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; + + for (int i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + + /* + * Seed the cached histograms. 
+ */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + +static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t 
*ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object, tx) == 0); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + count += ddt_object_count(ddt, type, class); + } + } + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (count == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + zio_t *rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + 
ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c new file mode 100644 index 0000000..6812aa3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c @@ -0,0 +1,156 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +int ddt_zap_leaf_blockshift = 12; +int ddt_zap_indirect_blockshift = 12; + +static int +ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) +{ + zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; + + if (prehash) + flags |= ZAP_FLAG_PRE_HASHED_KEY; + + *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + DMU_OT_NONE, 0, tx); + + return (*objectp == 0 ? 
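ddt_walk() above iterates class-major, then type, then checksum, resetting the cursor whenever it advances to the next table; because the position lives entirely in the caller-supplied ddt_bookmark_t, a walk can be suspended and resumed across transaction groups. A sketch of the caller-side loop (spa is assumed to be an open pool handle):

	ddt_bookmark_t ddb;
	ddt_entry_t dde;
	int error;

	bzero(&ddb, sizeof (ddb));		/* start from the beginning */
	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
		/* one on-disk entry per call; ddb now records the position */
	}
	/*
	 * ENOENT means every table has been visited; to suspend early,
	 * simply stop calling and keep ddb for a later resume.
	 */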
ENOTSUP : 0); +} + +static int +ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + return (zap_destroy(os, object, tx)); +} + +static int +ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t one, csize; + int error; + + error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, &one, &csize); + if (error) + return (error); + + ASSERT(one == 1); + ASSERT(csize <= sizeof (cbuf)); + + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf); + if (error) + return (error); + + ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + + return (0); +} + +static void +ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS); +} + +static int +ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize; + + csize = ddt_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), sizeof (cbuf)); + + return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx)); +} + +static int +ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, tx)); +} + +static int +ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + zap_cursor_init_serialized(&zc, os, object, *walk); + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static uint64_t +ddt_zap_count(objset_t *os, uint64_t object) +{ + uint64_t count = 0; + + VERIFY(zap_count(os, object, &count) == 0); + + return (count); +} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_prefetch, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 26b4e5f..56e284a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -40,7 +39,10 @@ #include #include #include +#include +#ifdef _KERNEL #include +#endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, @@ -48,8 +50,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint64_array, TRUE, "object array" }, { byteswap_uint8_array, TRUE, "packed nvlist" }, { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bplist" }, - { byteswap_uint64_array, TRUE, "bplist header" }, + { byteswap_uint64_array, TRUE, "bpobj" }, + { byteswap_uint64_array, TRUE, "bpobj header" }, { byteswap_uint64_array, TRUE, "SPA space map header" }, { byteswap_uint64_array, TRUE, "SPA space map" }, { byteswap_uint64_array, TRUE, "ZIL intent log" }, @@ -81,21 +83,38 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "FUID table" }, { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "scan work queue" }, { zap_byteswap, TRUE, "ZFS user/group used" }, { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, + { zap_byteswap, TRUE, "DDT ZAP algorithm" }, + { zap_byteswap, TRUE, "DDT statistics" }, + { byteswap_uint8_array, TRUE, "System attributes" }, + { zap_byteswap, TRUE, "SA master node" }, + { zap_byteswap, TRUE, "SA attr registration" }, + { zap_byteswap, TRUE, "SA attr layouts" }, + { zap_byteswap, TRUE, "scan translations" }, + { byteswap_uint8_array, FALSE, "deduplicated block" }, + { zap_byteswap, TRUE, "DSL deadlist map" }, + { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, + { zap_byteswap, TRUE, "DSL dir clones" }, + { byteswap_uint64_array, TRUE, "bpobj subobj" }, }; int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp) + void *tag, dmu_buf_t **dbp, int flags) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, offset); @@ -105,7 +124,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, if (db == NULL) { err = EIO; } else { - err = dbuf_read(db, NULL, DB_RF_CANFAIL); + err = dbuf_read(db, NULL, db_flags); if (err) { dbuf_rele(db, tag); db = NULL; @@ -113,7 +132,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, } dnode_rele(dn, FTAG); - *dbp = &db->db; + *dbp = &db->db; /* NULL db plus first field offset is NULL */ return (err); } @@ -124,16 +143,79 @@ dmu_bonus_max(void) } int -dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; - if (dn->dn_bonus != (dmu_buf_impl_t *)db) - return (EINVAL); - if (newsize < 0 || newsize > db->db_size) - return (EINVAL); - dnode_setbonuslen(dn, newsize, tx); - return (0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = EINVAL; + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = EINVAL; + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +int +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, 
dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (type > DMU_OT_NUMTYPES) { + error = EINVAL; + } else if (dn->dn_bonus != db) { + error = EINVAL; + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + dbuf_rm_spill(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_rm_spill(dn, tx); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); } /* @@ -146,7 +228,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dmu_buf_impl_t *db; int error; - error = dnode_hold(os->os, object, FTAG, &dn); + error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); @@ -158,21 +240,105 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dbuf_create_bonus(dn); } db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); /* as long as the bonus buf is held, the dnode will be held */ - if (refcount_add(&db->db_holds, tag) == 1) + if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* + * returns ENOENT, EIO, or 0. + * + * This interface will allocate a blank spill dbuf when a spill blk + * doesn't already exist on the dnode. + * + * if you only want to find an already existing spill db, then + * dmu_spill_hold_existing() should be used. 
+ */ +int +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = NULL; + int err; + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT(db != NULL); + err = dbuf_read(db, NULL, flags); + if (err == 0) + *dbp = &db->db; + else + dbuf_rele(db, tag); + return (err); +} + +int +dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = EINVAL; + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = ENOENT; + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } + + rw_exit(&dn->dn_struct_rwlock); + } + + DB_DNODE_EXIT(db); + return (err); +} + +int +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); +} + +/* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files @@ -278,7 +444,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -291,14 +457,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, } int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; int err; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); return (err); } @@ -331,7 +501,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os->os_meta_dnode; + dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; @@ -348,7 +518,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. 
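dmu_spill_hold_existing() above is the lookup-only interface: it returns EINVAL on pools older than SPA_VERSION_SA and ENOENT when the object has no spill block, otherwise it hands back a held spill dbuf. A sketch of a caller, with my_read_spill and its error handling purely illustrative:

	/* Illustrative only: read an object's spill area if it has one. */
	static int
	my_read_spill(objset_t *os, uint64_t object, void *tag)
	{
		dmu_buf_t *bonus, *spill;
		int err;

		if ((err = dmu_bonus_hold(os, object, tag, &bonus)) != 0)
			return (err);

		err = dmu_spill_hold_existing(bonus, tag, &spill);
		if (err == 0) {
			/* spill->db_data / spill->db_size are valid here */
			dmu_buf_rele(spill, tag);
		} else if (err == ENOENT) {
			err = 0;	/* object simply has no spill block */
		}
		dmu_buf_rele(bonus, tag);
		return (err);
	}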
*/ - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; @@ -480,7 +650,7 @@ dmu_free_long_range(objset_t *os, uint64_t object, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); @@ -495,7 +665,7 @@ dmu_free_object(objset_t *os, uint64_t object) dmu_tx_t *tx; int err; - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err != 0) return (err); @@ -523,7 +693,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); + int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); @@ -541,7 +711,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_t **dbp; int numbufs, err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -634,12 +804,157 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +void +dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; + + dmu_buf_will_not_fill(db, tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. 
+ */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_size(abuf)); + iov = uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; + xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice @@ -650,6 +965,11 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) if (err) return (err); +#ifdef UIO_XUIO + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; +#endif + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; @@ -660,8 +980,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + } if (err) break; @@ -672,19 +1008,16 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) return (err); } -int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) +static int +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; - int numbufs, i; + int numbufs; int err = 0; + int i; - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp); + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); @@ -722,11 +1055,52 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, size -= tocpy; } + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } -#ifndef __FreeBSD__ +int +dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size 
== 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); +} + +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_write_uio_dnode(dn, uio, size, tx); + + dnode_rele(dn, FTAG); + + return (err); +} + +#ifdef sun int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) @@ -781,8 +1155,8 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } -#endif /* !__FreeBSD__ */ -#endif /* _KERNEL */ +#endif /* sun */ +#endif /* * Allocate a loaned anonymous arc buffer. @@ -790,9 +1164,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; + spa_t *spa; - return (arc_loan_buf(dn->dn_objset->os_spa, size)); + DB_GET_SPA(&spa, db); + return (arc_loan_buf(spa, size)); } /* @@ -814,78 +1190,147 @@ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(dbuf); if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { + objset_t *os; + uint64_t object; + + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); + os = dn->dn_objset; + object = dn->dn_object; + DB_DNODE_EXIT(dbuf); + dbuf_rele(db, FTAG); - ASSERT(dn->dn_objset->os.os == dn->dn_objset); - dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz, - buf->b_data, tx); + dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } typedef struct { - dbuf_dirty_record_t *dr; - dmu_sync_cb_t *done; - void *arg; + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - if (!BP_IS_HOLE(bp)) { - ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); - ASSERT(BP_GET_LEVEL(bp) == 0); - bp->blk_fill = 1; - } else { - /* - * dmu_sync() can compress a block of zeros to a null blkptr - * but the block size still needs to be passed through to replay - */ - BP_SET_LSIZE(bp, db->db.db_size); + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. 
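dmu_request_arcbuf() and dmu_assign_arcbuf() above form the loaned-buffer write path: the caller fills a borrowed ARC buffer and hands it back, and the DMU either adopts it in place or falls back to dmu_write() and returns it. A sketch under those assumptions (my_loaned_write is illustrative; the transaction must already cover the write):

	/* Illustrative only: write one block through a loaned ARC buffer. */
	static void
	my_loaned_write(dmu_buf_t *db, uint64_t offset, int size,
	    const void *src, dmu_tx_t *tx)
	{
		arc_buf_t *abuf = dmu_request_arcbuf(db, size);

		bcopy(src, abuf->b_data, size);
		/* Consumes abuf, whether it is assigned directly or copied. */
		dmu_assign_arcbuf(db, offset, abuf, tx);
	}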
+ */ + BP_SET_LSIZE(bp, db->db_size); + } else { + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } } } +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; - dmu_sync_cb_t *done = in->done; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ - if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) - BP_ZERO(&dr->dt.dl.dr_overridden_by); - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + if (zio->io_error == 0) { + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) + BP_ZERO(&dr->dt.dl.dr_overridden_by); + } else { + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - if (done) - done(&(db->db), in->arg); + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static void +dmu_sync_late_arrival_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *dsa = zio->io_private; - kmem_free(in, sizeof (dmu_sync_arg_t)); + if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } + + dmu_tx_commit(dsa->dsa_tx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static int +dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, + zio_prop_t *zp, zbookmark_t *zb) +{ + dmu_sync_arg_t *dsa; + dmu_tx_t *tx; + + tx = dmu_tx_create(os); + dmu_tx_hold_space(tx, zgd->zgd_db->db_size); + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + dmu_tx_abort(tx); + return (EIO); /* Make zl_get_data do txg_waited_synced() */ + } + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = NULL; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = tx; + + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + + return (0); } /* @@ -904,157 +1349,112 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * - * EINPROGRESS: the IO has been initiated. - * The caller should log this blkptr in the callback. + * EIO: could not do the I/O. + * The caller should do a txg_wait_synced(). * - * 0: completed. Sets *bp to the blkptr just written. - * The caller should log this blkptr immediately. + * 0: the I/O has been initiated. + * The caller should log this blkptr in the done callback. + * It is possible that the I/O will fail, in which case + * the error will be reported to the done callback and + * propagated to pio from zio_done(). 
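The contract above is easiest to read from the caller's side; the in-tree callers are the zl_get_data callbacks. A hedged sketch of dispatching on the result (my_log_write and my_log_write_done are illustrative only):

	/* Illustrative only: how a log-write path reacts to dmu_sync(). */
	static void
	my_log_write_done(zgd_t *zgd, int error)
	{
		/* error == 0: zgd->zgd_bp is now valid and can be logged */
	}

	static int
	my_log_write(zio_t *pio, uint64_t txg, zgd_t *zgd)
	{
		int error = dmu_sync(pio, txg, my_log_write_done, zgd);

		switch (error) {
		case 0:		/* I/O issued; bp arrives in the done callback */
		case EEXIST:	/* txg already synced; nothing to log */
		case ENOENT:	/* block was freed; nothing to log */
		case EALREADY:	/* an in-flight dmu_sync() covers this block */
			return (0);
		case EIO:	/* caller should fall back to txg_wait_synced() */
		default:
			return (error);
		}
	}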
*/ int -dmu_sync(zio_t *pio, dmu_buf_t *db_fake, - blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) +dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - objset_impl_t *os = db->db_objset; - dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; - tx_state_t *tx = &dp->dp_tx; + blkptr_t *bp = zgd->zgd_bp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; - dmu_sync_arg_t *in; + dmu_sync_arg_t *dsa; zbookmark_t zb; - writeprops_t wp = { 0 }; - zio_t *zio; - int err; + zio_prop_t zp; + dnode_t *dn; + ASSERT(pio != NULL); ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", - txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); /* - * XXX - would be nice if we could do this without suspending... + * If we're frozen (running ziltest), we always need to generate a bp. */ - txg_suspend(dp); + if (txg > spa_freeze_txg(os->os_spa)) + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* - * If this txg already synced, there's nothing to do. + * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() + * and us. If we determine that this txg is not yet syncing, + * but it begins to sync a moment later, that's OK because the + * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ - if (txg <= tx->tx_synced_txg) { - txg_resume(dp); + mutex_enter(&db->db_mtx); + + if (txg <= spa_last_synced_txg(os->os_spa)) { /* - * If we're running ziltest, we need the blkptr regardless. + * This txg has already synced. There's nothing to do. */ - if (txg > spa_freeze_txg(dp->dp_spa)) { - /* if db_blkptr == NULL, this was an empty write */ - if (db->db_blkptr) - *bp = *db->db_blkptr; /* structure assignment */ - return (0); - } + mutex_exit(&db->db_mtx); return (EEXIST); } - mutex_enter(&db->db_mtx); - - if (txg == tx->tx_syncing_txg) { - while (db->db_data_pending) { - /* - * IO is in-progress. Wait for it to finish. - * XXX - would be nice to be able to somehow "attach" - * this zio to the parent zio passed in. - */ - cv_wait(&db->db_changed, &db->db_mtx); - if (!db->db_data_pending && - db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { - /* - * IO was compressed away - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - ASSERT(db->db_data_pending || - (db->db_blkptr && db->db_blkptr->blk_birth == txg)); - } - - if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { - /* - * IO is already completed. - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } + if (txg <= spa_syncing_txg(os->os_spa)) { + /* + * This txg is currently syncing, so we can't mess with + * the dirty record anymore; just write a new log block. + */ + mutex_exit(&db->db_mtx); + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; - while (dr && dr->dr_txg > txg) + while (dr && dr->dr_txg != txg) dr = dr->dr_next; - if (dr == NULL || dr->dr_txg < txg) { + + if (dr == NULL) { /* - * This dbuf isn't dirty, must have been free_range'd. + * There's no dr for this dbuf, so it must have been freed. 
* There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); - txg_resume(dp); return (ENOENT); } ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* - * We have already issued a sync write for this buffer. - */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (EALREADY); - } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * This buffer has already been synced. It could not + * We have already issued a sync write for this buffer, + * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ - *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); + return (EALREADY); } + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - in->dr = dr; - in->done = done; - in->arg = arg; mutex_exit(&db->db_mtx); - txg_resume(dp); - zb.zb_objset = os->os_dsl_dataset->ds_object; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = NULL; - wp.wp_type = db->db_dnode->dn_type; - wp.wp_level = db->db_level; - wp.wp_copies = os->os_copies; - wp.wp_dnchecksum = db->db_dnode->dn_checksum; - wp.wp_oschecksum = os->os_checksum; - wp.wp_dncompress = db->db_dnode->dn_compress; - wp.wp_oscompress = os->os_compress; + zio_nowait(arc_write(pio, os->os_spa, txg, + bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, + dmu_sync_ready, dmu_sync_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); - ASSERT(BP_IS_HOLE(bp)); - - zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), - txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - - if (pio) { - zio_nowait(zio); - err = EINPROGRESS; - } else { - err = zio_wait(zio); - ASSERT(err == 0); - } - return (err); + return (0); } int @@ -1064,7 +1464,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); @@ -1079,7 +1479,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); + (void) dnode_hold(os, object, FTAG, &dn); ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); @@ -1093,20 +1493,103 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); + (void) dnode_hold(os, object, FTAG, &dn); ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } +int zfs_mdcomp_disable = 0; +TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); + +void +dmu_write_policy(objset_t *os, dnode_t *dn, int 
level, int wp, zio_prop_t *zp) +{ + dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; + boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || + (wp & WP_SPILL)); + enum zio_checksum checksum = os->os_checksum; + enum zio_compress compress = os->os_compress; + enum zio_checksum dedup_checksum = os->os_dedup_checksum; + boolean_t dedup; + boolean_t dedup_verify = os->os_dedup_verify; + int copies = os->os_copies; + + /* + * Determine checksum setting. + */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[checksum].ci_correctable < 1 || + zio_checksum_table[checksum].ci_eck) + checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + checksum = zio_checksum_select(dn->dn_checksum, checksum); + } + + /* + * Determine compression setting. + */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + compress = zio_compress_select(dn->dn_compress, compress); + } + + /* + * Determine dedup setting. If we are in dmu_sync(), we won't + * actually dedup now because that's all done in syncing context; + * but we do want to use the dedup checkum. If the checksum is not + * strong enough to ensure unique signatures, force dedup_verify. + */ + dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); + if (dedup) { + checksum = dedup_checksum; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = 1; + } + + if (wp & WP_DMU_SYNC) + dedup = 0; + + if (wp & WP_NOFILL) { + ASSERT(!ismd && level == 0); + checksum = ZIO_CHECKSUM_OFF; + compress = ZIO_COMPRESS_OFF; + dedup = B_FALSE; + } + + zp->zp_checksum = checksum; + zp->zp_compress = compress; + zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; + zp->zp_level = level; + zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); + zp->zp_dedup = dedup; + zp->zp_dedup_verify = dedup && dedup_verify; +} + int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* @@ -1120,7 +1603,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } @@ -1134,21 +1617,27 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { + dnode_phys_t *dnp; + rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); + dnp = dn->dn_phys; + doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 
1ULL << dn->dn_indblkshift : 0; + doi->doi_type = dn->dn_type; + doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; - doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + - SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; - doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; - doi->doi_type = dn->dn_type; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); @@ -1162,7 +1651,7 @@ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); + int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -1178,9 +1667,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) * As above, but faster; can be used when you have a held dbuf in hand. */ void -dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { - dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); } /* @@ -1188,14 +1681,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) * This is specifically optimized for zfs_getattr(). */ void -dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; + DB_DNODE_EXIT(db); } void @@ -1246,8 +1745,12 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { - dbuf_init(); + zfs_dbgmsg_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); dnode_init(); + dbuf_init(); zfetch_init(); arc_init(); l2arc_init(); @@ -1256,9 +1759,13 @@ dmu_init(void) void dmu_fini(void) { + l2arc_fini(); arc_fini(); zfetch_fini(); - dnode_fini(); dbuf_fini(); - l2arc_fini(); + dnode_fini(); + dmu_objset_fini(); + xuio_stat_fini(); + sa_cache_fini(); + zfs_dbgmsg_fini(); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c new file mode 100644 index 0000000..c72a28b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct diffarg { + struct file *da_fp; /* file to which we are reporting */ + offset_t *da_offp; + int da_err; /* error that stopped diff search */ + dmu_diff_record_t da_ddr; + kthread_t *da_td; +}; + +static int +write_bytes(struct diffarg *da) +{ + struct uio auio; + struct iovec aiov; + + aiov.iov_base = (caddr_t)&da->da_ddr; + aiov.iov_len = sizeof (da->da_ddr); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = aiov.iov_len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_offset = (off_t)-1; + auio.uio_td = da->da_td; +#ifdef _KERNEL + if (da->da_fp->f_type == DTYPE_VNODE) + bwillwrite(); + return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td)); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + return (EOPNOTSUPP); +#endif +} + +static int +write_record(struct diffarg *da) +{ + + if (da->da_ddr.ddr_type == DDR_NONE) { + da->da_err = 0; + return (0); + } + + da->da_err = write_bytes(da); + *da->da_offp += sizeof (da->da_ddr); + return (da->da_err); +} + +static int +report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) +{ + ASSERT(first <= last); + if (da->da_ddr.ddr_type != DDR_FREE || + first != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_FREE; + da->da_ddr.ddr_first = first; + da->da_ddr.ddr_last = last; + return (0); + } + da->da_ddr.ddr_last = last; + return (0); +} + +static int +report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) +{ + ASSERT(dnp != NULL); + if (dnp->dn_type == DMU_OT_NONE) + return (report_free_dnode_range(da, object, object)); + + if (da->da_ddr.ddr_type != DDR_INUSE || + object != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_INUSE; + da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; + return (0); + } + da->da_ddr.ddr_last = object; + return (0); +} + +#define DBP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct diffarg *da = arg; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + if (zb->zb_object != DMU_META_DNODE_OBJECT) + return (0); + + if (bp == NULL) { + uint64_t span = DBP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + + err = report_free_dnode_range(da, dnobj, + dnobj + (span >> DNODE_SHIFT) - 1); + if (err) + return (err); + } else if (zb->zb_level == 0) { + dnode_phys_t *blk; + arc_buf_t *abuf; + uint32_t aflags = ARC_WAIT; + int blksz = BP_GET_LSIZE(bp); + int i; + + if (dsl_read(NULL, spa, bp, pbuf, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + 
ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = report_dnode(da, dnobj, blk+i); + if (err) + break; + } + (void) arc_buf_remove_ref(abuf, &abuf); + if (err) + return (err); + /* Don't care about the data blocks */ + return (TRAVERSE_VISIT_NO_CHILDREN); + } + return (0); +} + +int +dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct file *fp, offset_t *offp) +{ + struct diffarg da; + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; + dsl_dataset_t *findds; + dsl_dataset_t *relds; + int err = 0; + + /* make certain we are looking at snapshots */ + if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) + return (EINVAL); + + /* fromsnap must be earlier and from the same lineage as tosnap */ + if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) + return (EXDEV); + + relds = NULL; + findds = ds; + + while (fromds->ds_dir != findds->ds_dir) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (!dsl_dir_is_clone(findds->ds_dir)) { + if (relds) + dsl_dataset_rele(relds, FTAG); + return (EXDEV); + } + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); + rw_exit(&dp->dp_config_rwlock); + + if (relds) + dsl_dataset_rele(relds, FTAG); + + if (err) + return (EXDEV); + + relds = findds; + } + + if (relds) + dsl_dataset_rele(relds, FTAG); + + da.da_fp = fp; + da.da_offp = offp; + da.da_ddr.ddr_type = DDR_NONE; + da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; + da.da_err = 0; + da.da_td = curthread; + + err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); + + if (err) { + da.da_err = err; + } else { + /* we set the da.da_err we return as side-effect */ + (void) write_record(&da); + } + + return (da.da_err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 1f91fc1..8dff460 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -32,16 +31,15 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - objset_impl_t *osi = os->os; uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; - mutex_enter(&osi->os_obj_lock); + mutex_enter(&os->os_obj_lock); for (;;) { - object = osi->os_obj_next; + object = os->os_obj_next; /* * Each time we polish off an L2 bp worth of dnodes * (2^13 objects), move to another L2 bp that's still @@ -51,14 +49,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? 
object << DNODE_SHIFT : 0; - int error = dnode_next_offset(osi->os_meta_dnode, + int error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; } - osi->os_obj_next = ++object; + os->os_obj_next = ++object; /* * XXX We should check for an i/o error here and return @@ -66,19 +64,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. */ - (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - osi->os_obj_next = object - 1; + os->os_obj_next = object - 1; } dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); - mutex_exit(&osi->os_obj_lock); + mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, os, object); return (object); @@ -94,7 +92,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (err) return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); @@ -116,7 +114,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, if (object == DMU_META_DNODE_OBJECT) return (EBADF); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); @@ -128,7 +126,11 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, return (0); } - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (bonustype == DMU_OT_SA) { + nblkptr = 1; + } else { + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + } /* * If we are losing blkptrs or changing the block size this must @@ -166,7 +168,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); @@ -185,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; - error = dnode_next_offset(os->os->os_meta_dnode, + error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 2678b83..09d13db 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
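For reference, dmu_object_alloc() above is typically called with an assigned transaction that holds DMU_NEW_OBJECT. A minimal sketch, with my_make_object and the chosen object type purely illustrative:

	/* Illustrative only: allocate a new object inside one transaction. */
	static int
	my_make_object(objset_t *os, uint64_t *objp)
	{
		dmu_tx_t *tx = dmu_tx_create(os);
		int err;

		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
			dmu_tx_abort(tx);
			return (err);
		}
		*objp = dmu_object_alloc(os, DMU_OT_PLAIN_OTHER, 0,
		    DMU_OT_NONE, 0, tx);
		dmu_tx_commit(tx);
		return (0);
	}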
*/ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -36,22 +37,41 @@ #include #include #include -#include #include #include #include #include +#include +#include + +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. + */ +krwlock_t os_lock; + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} spa_t * dmu_objset_spa(objset_t *os) { - return (os->os->os_spa); + return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { - return (os->os->os_zil); + return (os->os_zil); } dsl_pool_t * @@ -59,82 +79,112 @@ dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; - if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else - return (spa_get_dsl(os->os->os_spa)); + return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { - return (os->os->os_dsl_dataset); + return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { - return (os->os->os_phys->os_type); + return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { - dsl_dataset_name(os->os->os_dsl_dataset, buf); + dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } +uint64_t +dmu_objset_syncprop(objset_t *os) +{ + return (os->os_sync); +} + +uint64_t +dmu_objset_logbias(objset_t *os) +{ + return (os->os_logbias); +} + static void checksum_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); - osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); + os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(osi->os_spa)); + ASSERT(newval <= spa_max_replication(os->os_spa)); - osi->os_copies = newval; + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); + + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. 
@@ -142,13 +192,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_primary_cache = newval; + os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. @@ -156,7 +206,35 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_secondary_cache = newval; + os->os_secondary_cache = newval; +} + +static void +sync_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || + newval == ZFS_SYNC_DISABLED); + + os->os_sync = newval; + if (os->os_zil) + zil_set_sync(os->os_zil, newval); +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); } void @@ -177,39 +255,37 @@ dmu_objset_byteswap(void *buf, size_t size) int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_impl_t **osip) + objset_t **osp) { - objset_impl_t *osi; + objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); - osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); - osi->os.os = osi; - osi->os_dsl_dataset = ds; - osi->os_spa = spa; - osi->os_rootbp = bp; - if (!BP_IS_HOLE(osi->os_rootbp)) { + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { uint32_t aflags = ARC_WAIT; zbookmark_t zb; - zb.zb_objset = ds ? ds->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - if (DMU_OS_IS_L2CACHEABLE(osi)) + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_L2CACHE; - dprintf_bp(osi->os_rootbp, "reading %s", ""); + dprintf_bp(os->os_rootbp, "reading %s", ""); /* - * NB: when bprewrite scrub can change the bp, + * XXX when bprewrite scrub can change the bp, * and this is called from dmu_objset_open_ds_os, the bp * could change, and we'll need a lock. */ - err = arc_read_nolock(NULL, spa, osi->os_rootbp, - arc_getbuf_func, &osi->os_phys_buf, + err = dsl_read_nolock(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { - kmem_free(osi, sizeof (objset_impl_t)); + kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; @@ -218,27 +294,27 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. 
*/ if (spa_version(spa) >= SPA_VERSION_USERSPACE && - arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { arc_buf_t *buf = arc_buf_alloc(spa, - sizeof (objset_phys_t), &osi->os_phys_buf, + sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); - bcopy(osi->os_phys_buf->b_data, buf->b_data, - arc_buf_size(osi->os_phys_buf)); - (void) arc_buf_remove_ref(osi->os_phys_buf, - &osi->os_phys_buf); - osi->os_phys_buf = buf; + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + (void) arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf); + os->os_phys_buf = buf; } - osi->os_phys = osi->os_phys_buf->b_data; - osi->os_flags = osi->os_phys->os_flags; + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - osi->os_phys_buf = arc_buf_alloc(spa, size, - &osi->os_phys_buf, ARC_BUFC_METADATA); - osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, size); + os->os_phys_buf = arc_buf_alloc(spa, size, + &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); } /* @@ -249,61 +325,78 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, */ if (ds) { err = dsl_prop_register(ds, "primarycache", - primary_cache_changed_cb, osi); + primary_cache_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "secondarycache", - secondary_cache_changed_cb, osi); + secondary_cache_changed_cb, os); if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); + checksum_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + compression_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + copies_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "dedup", + dedup_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "logbias", + logbias_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "sync", + sync_changed_cb, os); } if (err) { - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, - &osi->os_phys_buf) == 1); - kmem_free(osi, sizeof (objset_impl_t)); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf) == 1); + kmem_free(os, sizeof (objset_t)); return (err); } } else if (ds == NULL) { /* It's the meta-objset. 
*/ - osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_LZJB; - osi->os_copies = spa_max_replication(spa); - osi->os_primary_cache = ZFS_CACHE_ALL; - osi->os_secondary_cache = ZFS_CACHE_ALL; + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_LZJB; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = 0; + os->os_logbias = 0; + os->os_sync = 0; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil_header = osi->os_phys->os_zil_header; - osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); + if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { - list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); - list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } - list_create(&osi->os_dnodes, sizeof (dnode_t), + list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); - list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); - mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - - osi->os_meta_dnode = dnode_special_open(osi, - &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); - if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) { - osi->os_userused_dnode = dnode_special_open(osi, - &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); - osi->os_groupused_dnode = dnode_special_open(osi, - &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + DMU_META_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, + &os->os_meta_dnode); + if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { + DMU_USERUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, + &os->os_userused_dnode); + DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, + &os->os_groupused_dnode); } /* @@ -311,117 +404,96 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * have ds_opening_lock */ if (ds) { - VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, - dmu_objset_evict)); + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); } - *osip = osi; + *osp = os; return (0); } -static int -dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { - objset_impl_t *osi; + int err = 0; mutex_enter(&ds->ds_opening_lock); - osi = dsl_dataset_get_user_ptr(ds); - if (osi == NULL) { - int err; - + *osp = ds->ds_objset; + if (*osp == NULL) { err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, &osi); - if (err) { - mutex_exit(&ds->ds_opening_lock); - return (err); - } + ds, dsl_dataset_get_blkptr(ds), osp); } 
mutex_exit(&ds->ds_opening_lock); - - os->os = osi; - os->os_mode = DS_MODE_NOHOLD; - - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) - return (EINVAL); - return (0); + return (err); } +/* called from zpl */ int -dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +dmu_objset_hold(const char *name, void *tag, objset_t **osp) { - objset_t *os; + dsl_dataset_t *ds; int err; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dmu_objset_open_ds_os(ds, os, type); + err = dsl_dataset_hold(name, tag, &ds); if (err) - kmem_free(os, sizeof (objset_t)); - else - *osp = os; + return (err); + + err = dmu_objset_from_ds(ds, osp); + if (err) + dsl_dataset_rele(ds, tag); + return (err); } /* called from zpl */ int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) { - objset_t *os; dsl_dataset_t *ds; int err; - ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || - DS_MODE_TYPE(mode) == DS_MODE_OWNER); - - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - err = dsl_dataset_hold(name, os, &ds); - else - err = dsl_dataset_own(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); + err = dsl_dataset_own(name, B_FALSE, tag, &ds); + if (err) return (err); - } - err = dmu_objset_open_ds_os(ds, os, type); + err = dmu_objset_from_ds(ds, osp); if (err) { - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - dsl_dataset_rele(ds, os); - else - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - } else { - os->os_mode = mode; - *osp = os; + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dmu_objset_disown(*osp, tag); + return (EINVAL); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dmu_objset_disown(*osp, tag); + return (EROFS); } return (err); } void -dmu_objset_close(objset_t *os) +dmu_objset_rele(objset_t *os, void *tag) { - ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); + dsl_dataset_rele(os->os_dsl_dataset, tag); +} - if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) - dsl_dataset_rele(os->os->os_dsl_dataset, os); - else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) - dsl_dataset_disown(os->os->os_dsl_dataset, os); - kmem_free(os, sizeof (objset_t)); +void +dmu_objset_disown(objset_t *os, void *tag) +{ + dsl_dataset_disown(os->os_dsl_dataset, tag); } int dmu_objset_evict_dbufs(objset_t *os) { - objset_impl_t *osi = os->os; dnode_t *dn; - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&osi->os_dnodes, osi->os_meta_dnode); - list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); + list_remove(&os->os_dnodes, DMU_META_DNODE(os)); + list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance @@ -429,93 +501,114 @@ dmu_objset_evict_dbufs(objset_t *os) * hold. If there are no holds then it has no dbufs so OK to * skip. 
*/ - for (dn = list_head(&osi->os_dnodes); + for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&osi->os_dnodes, dn)) + dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { - next_dn = list_next(&osi->os_dnodes, next_dn); + next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - mutex_exit(&osi->os_lock); + mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); dn = next_dn; } - mutex_exit(&osi->os_lock); - return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); + dn = list_head(&os->os_dnodes); + mutex_exit(&os->os_lock); + return (dn != DMU_META_DNODE(os)); } void -dmu_objset_evict(dsl_dataset_t *ds, void *arg) +dmu_objset_evict(objset_t *os) { - objset_impl_t *osi = arg; - objset_t os; - int i; + dsl_dataset_t *ds = os->os_dsl_dataset; - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); - ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); - } + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); + checksum_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); + compression_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + copies_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "dedup", + dedup_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "logbias", + logbias_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "sync", + sync_changed_cb, os)); } VERIFY(0 == dsl_prop_unregister(ds, "primarycache", - primary_cache_changed_cb, osi)); + primary_cache_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", - secondary_cache_changed_cb, osi)); + secondary_cache_changed_cb, os)); } + if (os->os_sa) + sa_tear_down(os); + /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. */ - os.os = osi; - (void) dmu_objset_evict_dbufs(&os); + (void) dmu_objset_evict_dbufs(os); - dnode_special_close(osi->os_meta_dnode); - if (osi->os_userused_dnode) { - dnode_special_close(osi->os_userused_dnode); - dnode_special_close(osi->os_groupused_dnode); + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); } - zil_free(osi->os_zil); + zil_free(os->os_zil); + + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. 
+ */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); - ASSERT3P(list_head(&osi->os_dnodes), ==, NULL); + mutex_destroy(&os->os_lock); + mutex_destroy(&os->os_obj_lock); + mutex_destroy(&os->os_user_ptr_lock); + kmem_free(os, sizeof (objset_t)); +} - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); - mutex_destroy(&osi->os_lock); - mutex_destroy(&osi->os_obj_lock); - mutex_destroy(&osi->os_user_ptr_lock); - kmem_free(osi, sizeof (objset_impl_t)); +timestruc_t +dmu_objset_snap_cmtime(objset_t *os) +{ + return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ -objset_impl_t * +objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { - objset_impl_t *osi; + objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - if (ds) - mutex_enter(&ds->ds_opening_lock); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); - if (ds) - mutex_exit(&ds->ds_opening_lock); - mdn = osi->os_meta_dnode; + if (ds != NULL) + VERIFY(0 == dmu_objset_from_ds(ds, &os)); + else + VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); + + mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -550,24 +643,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); - osi->os_phys->os_type = type; - if (dmu_objset_userused_enabled(osi)) { - osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; - osi->os_flags = osi->os_phys->os_flags; + os->os_phys->os_type = type; + if (dmu_objset_userused_enabled(os)) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); - return (osi); + return (os); } struct oscarg { void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; - dsl_dataset_t *clone_parent; + dsl_dataset_t *clone_origin; const char *lastname; dmu_objset_type_t type; uint64_t flags; + cred_t *cr; }; /*ARGSUSED*/ @@ -585,17 +679,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err != ENOENT) return (err ? err : EEXIST); - if (oa->clone_parent != NULL) { - /* - * You can't clone across pools. - */ - if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) + if (oa->clone_origin != NULL) { + /* You can't clone across pools. */ + if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - /* - * You can only clone snapshots, not the head datasets. - */ - if (oa->clone_parent->ds_phys->ds_num_children == 0) + /* You can only clone snapshots, not the head datasets. 
*/ + if (!dsl_dataset_is_snapshot(oa->clone_origin)) return (EINVAL); } @@ -603,41 +693,40 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; + spa_t *spa = dd->dd_pool->dp_spa; struct oscarg *oa = arg2; - dsl_dataset_t *ds; - blkptr_t *bp; - uint64_t dsobj; + uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); - dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, oa->flags, cr, tx); + obj = dsl_dataset_create_sync(dd, oa->lastname, + oa->clone_origin, oa->flags, oa->cr, tx); + + if (oa->clone_origin == NULL) { + dsl_pool_t *dp = dd->dd_pool; + dsl_dataset_t *ds; + blkptr_t *bp; + objset_t *os; - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); - bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_impl_t *osi; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + ASSERT(BP_IS_HOLE(bp)); - /* This is an empty dmu_objset; not a clone. */ - osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, bp, oa->type, tx); + os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, cr, tx); + oa->userfunc(os, oa->userarg, oa->cr, tx); + dsl_dataset_rele(ds, FTAG); } - spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dsobj); - - dsl_dataset_rele(ds, FTAG); + spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); } int -dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; @@ -654,24 +743,13 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, return (EEXIST); } - dprintf("name=%s\n", name); - oa.userfunc = func; oa.userarg = arg; oa.lastname = tail; oa.type = type; oa.flags = flags; + oa.cr = CRED(); - if (clone_parent != NULL) { - /* - * You can't clone to a different type. - */ - if (clone_parent->os->os_phys->os_type != type) { - dsl_dir_close(pdd, FTAG); - return (EINVAL); - } - oa.clone_parent = clone_parent->os->os_dsl_dataset; - } err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); dsl_dir_close(pdd, FTAG); @@ -679,67 +757,59 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, } int -dmu_objset_destroy(const char *name) +dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) { - objset_t *os; - int error; - - /* - * If it looks like we'll be able to destroy it, and there's - * an unplayed replay log sitting around, destroy the log. - * It would be nicer to do this in dsl_dataset_destroy_sync(), - * but the replay log objset is modified in open context. - */ - error = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os); - if (error == 0) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; - zil_destroy(dmu_objset_zil(os), B_FALSE); + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; - error = dsl_dataset_destroy(ds, os); - /* - * dsl_dataset_destroy() closes the ds. 
- */ - kmem_free(os, sizeof (objset_t)); + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); } - return (error); + oa.lastname = tail; + oa.clone_origin = clone_origin; + oa.flags = flags; + oa.cr = CRED(); + + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); } -/* - * This will close the objset. - */ int -dmu_objset_rollback(objset_t *os) +dmu_objset_destroy(const char *name, boolean_t defer) { - int err; dsl_dataset_t *ds; + int error; - ds = os->os->os_dsl_dataset; - - if (!dsl_dataset_tryown(ds, TRUE, os)) { - dmu_objset_close(os); - return (EBUSY); + error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_destroy(ds, FTAG, defer); + /* dsl_dataset_destroy() closes the ds. */ } - err = dsl_dataset_rollback(ds, os->os->os_phys->os_type); - - /* - * NB: we close the objset manually because the rollback - * actually implicitly called dmu_objset_evict(), thus freeing - * the objset_impl_t. - */ - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - return (err); + return (error); } struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; + char *htag; char failed[MAXPATHLEN]; - boolean_t checkperms; + boolean_t recursive; + boolean_t needsuspend; + boolean_t temporary; nvlist_t *props; + struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ + dsl_dataset_t *newds; }; static int @@ -747,77 +817,137 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; struct snaparg *sn = arg2; + int error; /* The props have already been checked by zfs_check_userprops(). */ - return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset, - sn->snapname, tx)); + error = dsl_dataset_snapshot_check(os->os_dsl_dataset, + sn->snapname, tx); + if (error) + return (error); + + if (sn->temporary) { + /* + * Ideally we would just call + * dsl_dataset_user_hold_check() and + * dsl_dataset_destroy_check() here. However the + * dataset we want to hold and destroy is the snapshot + * that we just confirmed we can create, but it won't + * exist until after these checks are run. Do any + * checks we can here and if more checks are added to + * those routines in the future, similar checks may be + * necessary here. + */ + if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + /* + * Not checking number of tags because the tag will be + * unique, as it will be the only tag. 
+ */ + if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (E2BIG); + + sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + sn->ha->temphold = B_TRUE; + sn->ha->htag = sn->htag; + } + return (error); } static void -snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; struct snaparg *sn = arg2; - dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + dsl_dataset_snapshot_sync(ds, sn->snapname, tx); + + if (sn->props) { + dsl_props_arg_t pa; + pa.pa_props = sn->props; + pa.pa_source = ZPROP_SRC_LOCAL; + dsl_props_set_sync(ds->ds_prev, &pa, tx); + } + + if (sn->temporary) { + struct dsl_ds_destroyarg da; + + dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); + kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); + sn->ha = NULL; + sn->newds = ds->ds_prev; - if (sn->props) - dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx); + da.ds = ds->ds_prev; + da.defer = B_TRUE; + dsl_dataset_destroy_sync(&da, FTAG, tx); + } } static int -dmu_objset_snapshot_one(char *name, void *arg) +dmu_objset_snapshot_one(const char *name, void *arg) { struct snaparg *sn = arg; objset_t *os; int err; + char *cp; + + /* + * If the objset starts with a '%', then ignore it unless it was + * explicitly named (ie, not recursive). These hidden datasets + * are always inconsistent, and by not opening them here, we can + * avoid a race with dsl_dir_destroy_check(). + */ + cp = strrchr(name, '/'); + if (cp && cp[1] == '%' && sn->recursive) + return (0); (void) strcpy(sn->failed, name); /* - * Check permissions only when requested. This only applies when - * doing a recursive snapshot. The permission checks for the starting - * dataset have already been performed in zfs_secpolicy_snapshot() + * Check permissions if we are doing a recursive snapshot. The + * permission checks for the starting dataset have already been + * performed in zfs_secpolicy_snapshot() */ - if (sn->checkperms == B_TRUE && - (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) return (err); - err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os); + err = dmu_objset_hold(name, sn, &os); if (err != 0) return (err); - /* If the objset is in an inconsistent state, return busy */ - if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_close(os); - return (EBUSY); - } - /* - * NB: we need to wait for all in-flight changes to get to disk, - * so that we snapshot those changes. zil_suspend does this as - * a side effect. + * If the objset is in an inconsistent state (eg, in the process + * of being destroyed), don't snapshot it. As with %hidden + * datasets, we return EBUSY if this name was explicitly + * requested (ie, not recursive), and otherwise ignore it. */ - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) { - dsl_sync_task_create(sn->dstg, snapshot_check, - snapshot_sync, os, sn, 3); - } else { - dmu_objset_close(os); + if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { + dmu_objset_rele(os, sn); + return (sn->recursive ? 
0 : EBUSY); } - return (err); + if (sn->needsuspend) { + err = zil_suspend(dmu_objset_zil(os)); + if (err) { + dmu_objset_rele(os, sn); + return (err); + } + } + dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, + os, sn, 3); + + return (0); } int -dmu_objset_snapshot(char *fsname, char *snapname, - nvlist_t *props, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) { dsl_sync_task_t *dst; struct snaparg sn; spa_t *spa; + minor_t minor; int err; (void) strcpy(sn.failed, fsname); @@ -826,16 +956,31 @@ dmu_objset_snapshot(char *fsname, char *snapname, if (err) return (err); + if (temporary) { + if (cleanup_fd < 0) { + spa_close(spa, FTAG); + return (EINVAL); + } + if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { + spa_close(spa, FTAG); + return (err); + } + } + sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; + sn.htag = tag; sn.props = props; + sn.recursive = recursive; + sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + sn.temporary = temporary; + sn.ha = NULL; + sn.newds = NULL; if (recursive) { - sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { - sn.checkperms = B_FALSE; err = dmu_objset_snapshot_one(fsname, &sn); } @@ -845,15 +990,33 @@ dmu_objset_snapshot(char *fsname, char *snapname, for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { objset_t *os = dst->dst_arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; - if (dst->dst_err) + dsl_dataset_t *ds = os->os_dsl_dataset; + if (dst->dst_err) { dsl_dataset_name(ds, sn.failed); - zil_resume(dmu_objset_zil(os)); - dmu_objset_close(os); + } else if (temporary) { + dsl_register_onexit_hold_cleanup(sn.newds, tag, minor); + } + if (sn.needsuspend) + zil_resume(dmu_objset_zil(os)); +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) { + char name[MAXNAMELEN]; + + dmu_objset_name(os, name); + strlcat(name, "@", sizeof(name)); + strlcat(name, snapname, sizeof(name)); + zvol_create_minors(name); + } +#endif +#endif + dmu_objset_rele(os, &sn); } if (err) (void) strcpy(fsname, sn.failed); + if (temporary) + zfs_onexit_fd_rele(cleanup_fd); dsl_sync_task_group_destroy(sn.dstg); spa_close(spa, FTAG); return (err); @@ -888,11 +1051,10 @@ dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) /* ARGSUSED */ static void -ready(zio_t *zio, arc_buf_t *abuf, void *arg) +dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_impl_t *os = arg; + objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(bp == os->os_rootbp); @@ -908,24 +1070,34 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + ASSERT(BP_EQUAL(bp, bp_orig)); } else { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, zio, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, bp, 
os->os_synctx); + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void -dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; - writeprops_t wp = { 0 }; + zio_prop_t zp; zio_t *zio; list_t *list; list_t *newlist = NULL; @@ -949,42 +1121,33 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) /* * Create the root block IO */ - zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ - zb.zb_blkid = 0; - - wp.wp_type = DMU_OT_OBJSET; - wp.wp_level = 0; /* on-disk BP level; see above */ - wp.wp_copies = os->os_copies; - wp.wp_oschecksum = os->os_checksum; - wp.wp_oscompress = os->os_compress; - - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - os->os_rootbp, pio, tx); - } + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf, + os->os_rootbp, os->os_spa, &zb)); - arc_release(os->os_phys_buf, &os->os_phys_buf); + dmu_write_policy(os, NULL, 0, 0, &zp); - zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + zio = arc_write(pio, os->os_spa, tx->tx_txg, + os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, + dmu_objset_write_ready, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ - os->os_meta_dnode->dn_zio = zio; - dnode_sync(os->os_meta_dnode, tx); + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; - if (os->os_userused_dnode && - os->os_userused_dnode->dn_type != DMU_OT_NONE) { - os->os_userused_dnode->dn_zio = zio; - dnode_sync(os->os_userused_dnode, tx); - os->os_groupused_dnode->dn_zio = zio; - dnode_sync(os->os_groupused_dnode, tx); + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; @@ -1002,7 +1165,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); - list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT(dr->dr_dbuf->db_level == 0); list_remove(list, dr); @@ -1017,6 +1180,22 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) zio_nowait(zio); } +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || + !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); +} + +boolean_t +dmu_objset_is_dirty_anywhere(objset_t *os) +{ + for (int t = 0; t < TXG_SIZE; t++) + if (dmu_objset_is_dirty(os, t)) + return (B_TRUE); + return (B_FALSE); +} + static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void @@ -1026,74 +1205,86 @@ 
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) } boolean_t -dmu_objset_userused_enabled(objset_impl_t *os) +dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] && - os->os_userused_dnode); + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); +} + +static void +do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) +{ + if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { + int64_t delta = DNODE_SIZE + used; + if (subtract) + delta = -delta; + VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, + user, delta, tx)); + VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, + group, delta, tx)); + } } void -dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx) +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; - static const char zerobuf[DN_MAX_BONUSLEN] = {0}; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); while (dn = list_head(list)) { - dmu_object_type_t bonustype; - + int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); - ASSERT(dn->dn_oldphys); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ - if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { - VERIFY(0 == zap_create_claim(&os->os, + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - VERIFY(0 == zap_create_claim(&os->os, + VERIFY(0 == zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } /* - * If the object was not previously - * accounted, pretend that it was free. + * We intentionally modify the zap object even if the + * net delta is zero. Otherwise + * the block of the zap obj could be shared between + * datasets but need to be different between them after + * a bprewrite. */ - if (!(dn->dn_oldphys->dn_flags & - DNODE_FLAG_USERUSED_ACCOUNTED)) { - bzero(dn->dn_oldphys, sizeof (dnode_phys_t)); - } - /* - * If the object was freed, use the previous bonustype. - */ - bonustype = dn->dn_phys->dn_bonustype ? - dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype; - ASSERT(dn->dn_phys->dn_type != 0 || - (bcmp(DN_BONUS(dn->dn_phys), zerobuf, - DN_MAX_BONUSLEN) == 0 && - DN_USED_BYTES(dn->dn_phys) == 0)); - ASSERT(dn->dn_oldphys->dn_type != 0 || - (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf, - DN_MAX_BONUSLEN) == 0 && - DN_USED_BYTES(dn->dn_oldphys) == 0)); - used_cbs[os->os_phys->os_type](&os->os, bonustype, - DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys), - DN_USED_BYTES(dn->dn_oldphys), - DN_USED_BYTES(dn->dn_phys), tx); + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { + do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); + } + if (flags & DN_ID_NEW_EXIST) { + do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), + dn->dn_phys->dn_flags, dn->dn_newuid, + dn->dn_newgid, B_FALSE, tx); + } - /* - * The mutex is needed here for interlock with dnode_allocate. 
- */ mutex_enter(&dn->dn_mtx); - zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); - dn->dn_oldphys = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); list_remove(list, dn); @@ -1101,10 +1292,151 @@ dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx) } } +/* + * Returns a pointer to data to find uid/gid from + * + * If a dirty record for transaction group that is syncing can't + * be found then NULL is returned. In the NULL case it is assumed + * the uid/gid aren't changing. + */ +static void * +dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr, **drp; + void *data; + + if (db->db_dirtycnt == 0) + return (db->db.db_data); /* Nothing is changing */ + + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg == tx->tx_txg) + break; + + if (dr == NULL) { + data = NULL; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + + return (data); +} + +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_impl_t *db = NULL; + uint64_t *user, *group; + int flags = dn->dn_id_flags; + int error; + boolean_t have_spill = B_FALSE; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) { + if (dn->dn_bonus) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + data = dmu_objset_userquota_find_data(db, tx); + } else { + data = DN_BONUS(dn->dn_phys); + } + } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, + FTAG, (dmu_buf_t **)&db); + ASSERT(error == 0); + mutex_enter(&db->db_mtx); + data = (before) ? db->db.db_data : + dmu_objset_userquota_find_data(db, tx); + have_spill = B_TRUE; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + if (before) { + ASSERT(data); + user = &dn->dn_olduid; + group = &dn->dn_oldgid; + } else if (data) { + user = &dn->dn_newuid; + group = &dn->dn_newgid; + } + + /* + * Must always call the callback in case the object + * type has changed and that type isn't an object type to track + */ + error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, + user, group); + + /* + * Preserve existing uid/gid when the callback can't determine + * what the new uid/gid are and the callback returned EEXIST. + * The EEXIST error tells us to just use the existing uid/gid. + * If we don't know what the old values are then just assign + * them to 0, since that is a new file being created. 
+ */ + if (!before && data == NULL && error == EEXIST) { + if (flags & DN_ID_OLD_EXIST) { + dn->dn_newuid = dn->dn_olduid; + dn->dn_newgid = dn->dn_oldgid; + } else { + dn->dn_newuid = 0; + dn->dn_newgid = 0; + } + error = 0; + } + + if (db) + mutex_exit(&db->db_mtx); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (have_spill) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (have_spill) + dmu_buf_rele((dmu_buf_t *)db, FTAG); +} + boolean_t dmu_objset_userspace_present(objset_t *os) { - return (os->os->os_phys->os_flags & + return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } @@ -1116,7 +1448,7 @@ dmu_objset_userspace_upgrade(objset_t *os) if (dmu_objset_userspace_present(os)) return (0); - if (!dmu_objset_userused_enabled(os->os)) + if (!dmu_objset_userused_enabled(os)) return (ENOTSUP); if (dmu_objset_is_snapshot(os)) return (EINVAL); @@ -1152,7 +1484,7 @@ dmu_objset_userspace_upgrade(objset_t *os) dmu_tx_commit(tx); } - os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } @@ -1161,35 +1493,35 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { - return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { - stat->dds_type = os->os->os_phys->os_type; - if (os->os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { - ASSERT(os->os->os_dsl_dataset || - os->os->os_phys->os_type == DMU_OST_META); + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); - if (os->os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os->os_dsl_dataset, nv); + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os->os_phys->os_type); + os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } @@ -1197,8 +1529,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv) int dmu_objset_is_snapshot(objset_t *os) { - if (os->os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + if (os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } @@ -1207,7 +1539,7 @@ int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (ds->ds_phys->ds_snapnames_zapobj == 0) @@ -1222,7 +1554,7 @@ int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; @@ 
-1259,12 +1591,12 @@ int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { - dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ - if (os->os->os_dsl_dataset->ds_object != + if (os->os_dsl_dataset->ds_object != dd->dd_phys->dd_head_dataset_obj) return (ENOENT); @@ -1293,7 +1625,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, } struct findarg { - int (*func)(char *, void *); + int (*func)(const char *, void *); void *arg; }; @@ -1302,7 +1634,7 @@ static int findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { struct findarg *fa = arg; - return (fa->func((char *)dsname, fa->arg)); + return (fa->func(dsname, fa->arg)); } /* @@ -1310,7 +1642,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) * Perhaps change all callers to use dmu_objset_find_spa()? */ int -dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) +dmu_objset_find(const char *name, int func(const char *, void *), void *arg, + int flags) { struct findarg fa; fa.func = func; @@ -1361,12 +1694,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name, ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "/"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s/%s", name, attr->za_name); err = dmu_objset_find_spa(spa, child, func, arg, flags); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1400,13 +1730,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name, sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s@%s", + name, attr->za_name); err = func(spa, attr->za_first_integer, child, arg); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1429,7 +1757,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name, /* ARGSUSED */ int -dmu_objset_prefetch(char *name, void *arg) +dmu_objset_prefetch(const char *name, void *arg) { dsl_dataset_t *ds; @@ -1438,16 +1766,14 @@ dmu_objset_prefetch(char *name, void *arg) if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { mutex_enter(&ds->ds_opening_lock); - if (!dsl_dataset_get_user_ptr(ds)) { + if (ds->ds_objset == NULL) { uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; - zb.zb_objset = ds->ds_object; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), + (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds), &ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, @@ -1463,13 +1789,13 @@ dmu_objset_prefetch(char *name, void *arg) void dmu_objset_set_user(objset_t *os, void *user_ptr) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - os->os->os_user_ptr = user_ptr; + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - return (os->os->os_user_ptr); + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index ed5afb4..55451fd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -33,14 +32,32 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include +#include static char *dmu_recv_tag = "dmu_recv_tag"; +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free() and + * dump_freeobjects() can be aggregated into a single DRR_FREE or + * DRR_FREEOBJECTS replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS +} pendop_t; + struct backuparg { dmu_replay_record_t *drr; kthread_t *td; @@ -48,7 +65,9 @@ struct backuparg { offset_t *off; objset_t *os; zio_cksum_t zc; + uint64_t toguid; int err; + pendop_t pending_op; }; static int @@ -56,11 +75,9 @@ dump_bytes(struct backuparg *ba, void *buf, int len) { struct uio auio; struct iovec aiov; - ASSERT3U(len % 8, ==, 0); fletcher_4_incremental_native(buf, len, &ba->zc); - aiov.iov_base = buf; aiov.iov_len = len; auio.uio_iov = &aiov; @@ -79,7 +96,6 @@ dump_bytes(struct backuparg *ba, void *buf, int len) ba->err = EOPNOTSUPP; #endif *ba->off += len; - return (ba->err); } @@ -87,29 +103,120 @@ static int dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, uint64_t length) { - /* write a FREE record */ + struct drr_free *drrf = &(ba->drr->drr_u.drr_free); + + /* + * If there is a pending op, but it's not PENDING_FREE, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_FREE records can only be aggregated with + * other DRR_FREE records. DRR_FREEOBJECTS records can only be + * aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + if (ba->pending_op == PENDING_FREE) { + /* + * There should never be a PENDING_FREE if length is -1 + * (because dump_dnode is the only place where this + * function is called with a -1, and only after flushing + * any pending record). + */ + ASSERT(length != -1ULL); + /* + * Check to see whether this free block can be aggregated + * with pending one. + */ + if (drrf->drr_object == object && drrf->drr_offset + + drrf->drr_length == offset) { + drrf->drr_length += length; + return (0); + } else { + /* not a continuation. 
Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* create a FREE record and make it pending */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREE; - ba->drr->drr_u.drr_free.drr_object = object; - ba->drr->drr_u.drr_free.drr_offset = offset; - ba->drr->drr_u.drr_free.drr_length = length; + drrf->drr_object = object; + drrf->drr_offset = offset; + drrf->drr_length = length; + drrf->drr_toguid = ba->toguid; + if (length == -1ULL) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + } else { + ba->pending_op = PENDING_FREE; + } - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); return (0); } static int dump_data(struct backuparg *ba, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, void *data) + uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { + struct drr_write *drrw = &(ba->drr->drr_u.drr_write); + + + /* + * If there is any kind of pending aggregation (currently either + * a grouping of free objects or free blocks), push it out to + * the stream, since aggregation can't be done across operations + * of different types. + */ + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } /* write a DATA record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_WRITE; - ba->drr->drr_u.drr_write.drr_object = object; - ba->drr->drr_u.drr_write.drr_type = type; - ba->drr->drr_u.drr_write.drr_offset = offset; - ba->drr->drr_u.drr_write.drr_length = blksz; + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = ba->toguid; + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + if (dump_bytes(ba, data, blksz) != 0) + return (EINTR); + return (0); +} + +static int +dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) +{ + struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); + + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + /* write a SPILL record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_SPILL; + drrs->drr_object = object; + drrs->drr_length = blksz; + drrs->drr_toguid = ba->toguid; if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) return (EINTR); @@ -121,39 +228,80 @@ dump_data(struct backuparg *ba, dmu_object_type_t type, static int dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) { + struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); + + /* + * If there is a pending op, but it's not PENDING_FREEOBJECTS, + * push it out, since free block aggregation can only be done for + * blocks of the same type (i.e., DRR_FREE records can only be + * aggregated with other DRR_FREE records. 
DRR_FREEOBJECTS records + * can only be aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && + ba->pending_op != PENDING_FREEOBJECTS) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + if (ba->pending_op == PENDING_FREEOBJECTS) { + /* + * See whether this free object array can be aggregated + * with pending one + */ + if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { + drrfo->drr_numobjs += numobjs; + return (0); + } else { + /* can't be aggregated. Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* write a FREEOBJECTS record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREEOBJECTS; - ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; - ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + drrfo->drr_firstobj = firstobj; + drrfo->drr_numobjs = numobjs; + drrfo->drr_toguid = ba->toguid; + + ba->pending_op = PENDING_FREEOBJECTS; - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); return (0); } static int dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) { + struct drr_object *drro = &(ba->drr->drr_u.drr_object); + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(ba, object, 1)); + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + /* write an OBJECT record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_OBJECT; - ba->drr->drr_u.drr_object.drr_object = object; - ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; - ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; - ba->drr->drr_u.drr_object.drr_blksz = - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; - ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; - ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + drro->drr_object = object; + drro->drr_type = dnp->dn_type; + drro->drr_bonustype = dnp->dn_bonustype; + drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_checksumtype = dnp->dn_checksum; + drro->drr_compress = dnp->dn_compress; + drro->drr_toguid = ba->toguid; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (EINTR); /* free anything past the end of the file */ @@ -169,9 +317,10 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +/* ARGSUSED */ static int -backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; @@ -180,9 +329,10 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (bp == NULL && zb->zb_object == 0) { + } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); @@ -198,7 +348,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; - if (arc_read_nolock(NULL, spa, bp, + if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); @@ -212,7 +362,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, break; } (void) arc_buf_remove_ref(abuf, &abuf); - } else { /* it's a level-0 block of a regular object */ + } else if (type == DMU_OT_SA) { uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); @@ -222,8 +372,20 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); + err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); + } else { /* it's a level-0 block of a regular object */ + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + int blksz = BP_GET_LSIZE(bp); + + if (dsl_read(NULL, spa, bp, pbuf, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, abuf->b_data); + blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -235,8 +397,8 @@ int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, struct file *fp, offset_t *off) { - dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; dmu_replay_record_t *drr; struct backuparg ba; int err; @@ -273,10 +435,25 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; + DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, + DMU_SUBSTREAM); + +#ifdef _KERNEL + if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) + return (EINVAL); + if (version == ZPL_VERSION_SA) { + DMU_SET_FEATUREFLAGS( + drr->drr_u.drr_begin.drr_versioninfo, + DMU_BACKUP_FEATURE_SA_SPILL); + } + } +#endif + drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; if (fromorigin) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; @@ -297,9 +474,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, ba.fp = fp; ba.os = tosnap; ba.off = off; + ba.toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + ba.pending_op = PENDING_NONE; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); return (ba.err); } @@ -307,6 +486,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, &ba); + if (ba.pending_op != PENDING_NONE) + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) + err = EINTR; + if (err) { if (err == EINTR && ba.err) err = ba.err; @@ -317,8 +500,9 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = ba.zc; + drr->drr_u.drr_end.drr_toguid = ba.toguid; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); return (ba.err); } @@ -339,33 +523,12 @@ struct recvbeginsyncarg { uint64_t dsflags; char clonelastname[MAXNAMELEN]; dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ + cred_t *cr; }; -static dsl_dataset_t * -recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, - cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - - /* This should always work, since we just created it */ - /* XXX - create should return an owned ds */ - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, - DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); - - if (type != DMU_OST_NONE) { - (void) dmu_objset_create_impl(dp->dp_spa, - ds, &ds->ds_phys->ds_bp, type, tx); - } - - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); - - return (ds); -} - /* ARGSUSED */ static int -recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -383,7 +546,7 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) /* make sure it's a snap in the same pool */ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) + if 
(!dsl_dataset_is_snapshot(rbsa->origin)) return (EINVAL); if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) return (ENODEV); @@ -393,77 +556,31 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; + /* Create and open new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, cr, tx); - - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); -} + rbsa->origin, flags, rbsa->cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, + B_TRUE, dmu_recv_tag, &rbsa->ds)); -static int -recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; - - /* must be a head ds */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* must not be a clone ds */ - if (dsl_dir_is_clone(ds->ds_dir)) - return (EINVAL); - - err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); - if (err) - return (err); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) - return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); + if (rbsa->origin == NULL) { + (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, + rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); } - return (0); -} - -static void -recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_dir_t *dd = ds->ds_dir; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; - - /* - * NB: caller must provide an extra hold on the dsl_dir_t, so it - * won't go away when dsl_dataset_destroy_sync() closes the - * dataset. - */ - dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); - - dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); - - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); + spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, + dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); } /* ARGSUSED */ static int -recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -474,77 +591,105 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); - - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); - - /* temporary clone name must not exist */ + /* new snapshot name must not exist */ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, - rbsa->clonelastname, 8, 1, &val); + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); - /* new snapshot name must not exist */ + if (rbsa->fromguid) { + /* if incremental, most recent snapshot must match fromguid */ + if (ds->ds_prev == NULL) + return (ENODEV); + + /* + * most recent snapshot must match fromguid, or there are no + * changes since the fromguid one + */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { + uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; + uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; + while (obj != 0) { + dsl_dataset_t *snap; + err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + obj, FTAG, &snap); + if (err) + return (ENODEV); + if (snap->ds_phys->ds_creation_txg < birth) { + dsl_dataset_rele(snap, FTAG); + return (ENODEV); + } + if (snap->ds_phys->ds_guid == rbsa->fromguid) { + dsl_dataset_rele(snap, FTAG); + break; /* it's ok */ + } + obj = snap->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (ENODEV); + } + } else { + /* if full, most recent snapshot must be $ORIGIN */ + if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) + return (ENODEV); + } + + /* temporary clone name must not exist */ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); + return (0); } /* ARGSUSED */ static void -recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *ods, *cds; + dsl_dataset_t *cds; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; - /* create the temporary clone */ - VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, - FTAG, &ods)); - dsobj = dsl_dataset_create_sync(ohds->ds_dir, - rbsa->clonelastname, ods, flags, cr, tx); - dsl_dataset_rele(ods, FTAG); - - /* open the temporary clone */ - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, - DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); + /* create and open the temporary clone */ + dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, + ohds->ds_prev, flags, rbsa->cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); - /* copy the refquota from the target fs to the clone */ - if 
(ohds->ds_quota > 0) - dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); + /* + * If we actually created a non-clone, we need to create the + * objset in our new dataset. + */ + if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { + (void) dmu_objset_create_impl(dp->dp_spa, + cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); + } rbsa->ds = cds; - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, "dataset = %lld", dsobj); } -/* ARGSUSED */ -static void -recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +static boolean_t +dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) { - dsl_dataset_t *ds = arg1; + int featureflags; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_object); + /* Verify pool version supports SA if SA_SPILL feature set */ + return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); } /* @@ -552,13 +697,13 @@ recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) { int err = 0; boolean_t byteswap; - struct recvbeginsyncarg rbsa; - uint64_t version; + struct recvbeginsyncarg rbsa = { 0 }; + uint64_t versioninfo; int flags; dsl_dataset_t *ds; @@ -571,22 +716,23 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, rbsa.tofs = tofs; rbsa.tosnap = tosnap; - rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL; + rbsa.origin = origin ? origin->os_dsl_dataset : NULL; rbsa.fromguid = drrb->drr_fromguid; rbsa.type = drrb->drr_type; rbsa.tag = FTAG; rbsa.dsflags = 0; - version = drrb->drr_version; + rbsa.cr = CRED(); + versioninfo = drrb->drr_versioninfo; flags = drrb->drr_flags; if (byteswap) { rbsa.type = BSWAP_32(rbsa.type); rbsa.fromguid = BSWAP_64(rbsa.fromguid); - version = BSWAP_64(version); + versioninfo = BSWAP_64(versioninfo); flags = BSWAP_32(flags); } - if (version != DMU_BACKUP_STREAM_VERSION || + if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || rbsa.type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); @@ -597,102 +743,81 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; + drc->drc_top_ds = top_ds; drc->drc_force = force; /* * Process the begin in syncing context. 
*/ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { - /* offline incremental receive */ - err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); - if (err) - return (err); - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - rbsa.fromguid) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err == 0) { + if (dmu_recv_verify_features(ds, drrb)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (ENOTSUP); } - rbsa.force = B_FALSE; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_offline_incremental_sync, ds, &rbsa, 1); - if (err) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (err); + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EINVAL); + } + + /* must not have an incremental recv already in progress */ + if (!mutex_tryenter(&ds->ds_recvlock)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EBUSY); } - drc->drc_logical_ds = drc->drc_real_ds = ds; - } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { - /* online incremental receive */ /* tmp clone name is: tofs/%tosnap" */ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), "%%%s", tosnap); - - /* open the dataset we are logically receiving into */ - err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); - if (err) - return (err); - rbsa.force = force; err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_online_incremental_sync, ds, &rbsa, 5); + recv_existing_check, recv_existing_sync, ds, &rbsa, 5); if (err) { + mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); return (err); } drc->drc_logical_ds = ds; drc->drc_real_ds = rbsa.ds; - } else { - /* create new fs -- full backup or clone */ - dsl_dir_t *dd = NULL; - const char *tail; + } else if (err == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char *cp; - err = dsl_dir_open(tofs, FTAG, &dd, &tail); + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. 
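
For illustration only (not part of the patch): the receive-begin code around this point derives two names, the hidden clone used when receiving an incremental into an existing filesystem, whose last component is "%<tosnap>", and the parent dataset under which a brand-new filesystem is created. A standalone sketch of both naming steps; the dataset and snapshot names are invented.

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char tofs[] = "tank/home/fs";       /* invented target fs name */
        const char *tosnap = "backup1";     /* invented snapshot name */
        char clonelastname[64];
        char *cp;

        /* tmp clone name is: tofs/%tosnap */
        (void) snprintf(clonelastname, sizeof (clonelastname), "%%%s", tosnap);
        printf("temporary clone component: %s\n", clonelastname);

        /* Open the parent of tofs: clip the name at the last '/' and restore it. */
        cp = strrchr(tofs, '/');
        *cp = '\0';
        printf("parent dataset: %s\n", tofs);
        *cp = '/';

        return (0);
    }
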
+ */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) + return (ENOENT); + + /* Open the parent of tofs */ + cp = strrchr(tofs, '/'); + *cp = '\0'; + err = dsl_dataset_hold(tofs, FTAG, &ds); + *cp = '/'; if (err) return (err); - if (tail == NULL) { - if (!force) { - dsl_dir_close(dd, FTAG); - return (EEXIST); - } - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dataset_own_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, - DS_MODE_INCONSISTENT, FTAG, &ds); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, FTAG); - return (err); - } - dsl_dataset_make_exclusive(ds, FTAG); - err = dsl_sync_task_do(dd->dd_pool, - recv_full_existing_check, - recv_full_existing_sync, ds, &rbsa, 5); - dsl_dataset_disown(ds, FTAG); - } else { - err = dsl_sync_task_do(dd->dd_pool, recv_full_check, - recv_full_sync, dd, &rbsa, 5); + if (dmu_recv_verify_features(ds, drrb)) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); } - dsl_dir_close(dd, FTAG); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); + dsl_dataset_rele(ds, FTAG); if (err) return (err); drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; drc->drc_newfs = B_TRUE; } - return (0); + return (err); } struct restorearg { @@ -704,10 +829,100 @@ struct restorearg { uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; + avl_tree_t *guid_to_ds_map; }; +typedef struct guid_map_entry { + uint64_t guid; + dsl_dataset_t *gme_ds; + avl_node_t avlnode; +} guid_map_entry_t; + static int -restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid) +guid_compare(const void *arg1, const void *arg2) +{ + const guid_map_entry_t *gmep1 = arg1; + const guid_map_entry_t *gmep2 = arg2; + + if (gmep1->guid < gmep2->guid) + return (-1); + else if (gmep1->guid > gmep2->guid) + return (1); + return (0); +} + +/* + * This function is a callback used by dmu_objset_find() (which + * enumerates the object sets) to build an avl tree that maps guids + * to datasets. The resulting table is used when processing DRR_WRITE_BYREF + * send stream records. These records, which are used in dedup'ed + * streams, do not contain data themselves, but refer to a copy + * of the data block that has already been written because it was + * earlier in the stream. That previous copy is identified by the + * guid of the dataset with the referenced data. + */ +int +find_ds_by_guid(const char *name, void *arg) +{ + avl_tree_t *guid_map = arg; + dsl_dataset_t *ds, *snapds; + guid_map_entry_t *gmep; + dsl_pool_t *dp; + int err; + uint64_t lastobj, firstobj; + + if (dsl_dataset_hold(name, FTAG, &ds) != 0) + return (0); + + dp = ds->ds_dir->dd_pool; + rw_enter(&dp->dp_config_rwlock, RW_READER); + firstobj = ds->ds_dir->dd_phys->dd_origin_obj; + lastobj = ds->ds_phys->ds_prev_snap_obj; + + while (lastobj != firstobj) { + err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds); + if (err) { + /* + * Skip this snapshot and move on. It's not + * clear why this would ever happen, but the + * remainder of the snapshot streadm can be + * processed. 
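
For illustration only (not part of the patch): find_ds_by_guid() above builds a guid-to-dataset table so that DRR_WRITE_BYREF records in a dedup'ed stream can be resolved to the dataset that already holds the referenced block. A minimal userland sketch of that lookup, using a sorted array and bsearch() in place of the kernel AVL tree; the guids and dataset names are invented.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct guid_map_entry {
        uint64_t    guid;       /* ds_guid of a snapshot */
        const char  *ds_name;   /* stands in for the held dsl_dataset_t */
    } guid_map_entry_t;

    /* Same ordering rule as guid_compare() in the patch: order by guid. */
    static int
    guid_compare(const void *a, const void *b)
    {
        const guid_map_entry_t *g1 = a;
        const guid_map_entry_t *g2 = b;

        if (g1->guid < g2->guid)
            return (-1);
        if (g1->guid > g2->guid)
            return (1);
        return (0);
    }

    int
    main(void)
    {
        /* Built once per receive from the snapshots below the target fs. */
        guid_map_entry_t map[] = {
            { 0x1111, "tank/fs@snap1" },
            { 0x2222, "tank/fs@snap2" },
            { 0x3333, "tank/other@snap1" },
        };
        size_t n = sizeof (map) / sizeof (map[0]);
        guid_map_entry_t key = { 0x2222, NULL };
        guid_map_entry_t *hit;

        qsort(map, n, sizeof (map[0]), guid_compare);

        /*
         * A WRITE_BYREF record carries the guid of the dataset that already
         * holds the block; an unknown guid fails the record with EINVAL.
         */
        hit = bsearch(&key, map, n, sizeof (map[0]), guid_compare);
        if (hit == NULL) {
            fprintf(stderr, "refguid not found\n");
            return (1);
        }
        printf("read the referenced block from %s\n", hit->ds_name);
        return (0);
    }
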
+ */ + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + return (0); + } + + gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); + gmep->guid = snapds->ds_phys->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + lastobj = snapds->ds_phys->ds_prev_snap_obj; + } + + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + return (0); +} + +static void +free_guid_map_onexit(void *arg) +{ + avl_tree_t *ca = arg; + void *cookie = NULL; + guid_map_entry_t *gmep; + + while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { + dsl_dataset_rele(gmep->gme_ds, ca); + kmem_free(gmep, sizeof (guid_map_entry_t)); + } + avl_destroy(ca); + kmem_free(ca, sizeof (avl_tree_t)); +} + +static int +restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) { struct uio auio; struct iovec aiov; @@ -742,7 +957,7 @@ restore_read(struct restorearg *ra, int len) ASSERT3U(len % 8, ==, 0); while (done < len) { - int resid; + ssize_t resid; ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, len - done, ra->voff, &resid); @@ -774,7 +989,7 @@ backup_byteswap(dmu_replay_record_t *drr) switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_version); + DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); @@ -788,27 +1003,56 @@ backup_byteswap(dmu_replay_record_t *drr) DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); + DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_length); + DO64(drr_write.drr_toguid); + DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write.drr_key.ddk_prop); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); + break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); DO64(drr_end.drr_checksum.zc_word[2]); DO64(drr_end.drr_checksum.zc_word[3]); + DO64(drr_end.drr_toguid); break; } #undef DO64 @@ -825,7 +1069,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (drro->drr_type == DMU_OT_NONE || drro->drr_type >= DMU_OT_NUMTYPES || drro->drr_bonustype >= DMU_OT_NUMTYPES || - drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || @@ 
-864,8 +1108,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen); } - if (err) + if (err) { return (EINVAL); + } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, drro->drr_object); @@ -875,7 +1120,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) return (err); } - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, + tx); dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { @@ -957,6 +1203,114 @@ restore_write(struct restorearg *ra, objset_t *os, return (0); } +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +static int +restore_write_byref(struct restorearg *ra, objset_t *os, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (EINVAL); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. + */ + if (drrwbr->drr_toguid != drrwbr->drr_refguid) { + gmesrch.guid = drrwbr->drr_refguid; + if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, + &where)) == NULL) { + return (EINVAL); + } + if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) + return (EINVAL); + } else { + ref_os = os; + } + + if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) + return (err); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + dmu_write(os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + dmu_buf_rele(dbp, FTAG); + dmu_tx_commit(tx); + return (0); +} + +static int +restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) +{ + dmu_tx_t *tx; + void *data; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > SPA_MAXBLOCKSIZE) + return (EINVAL); + + data = restore_read(ra, drrs->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrs->drr_object, NULL) != 0) + return (EINVAL); + + VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + dmu_buf_will_dirty(db_spill, tx); + + if (db_spill->db_size < drrs->drr_length) + VERIFY(0 == dbuf_spill_set_blksz(db_spill, + drrs->drr_length, tx)); + bcopy(data, db_spill->db_data, drrs->drr_length); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + /* ARGSUSED */ static int restore_free(struct restorearg *ra, objset_t *os, @@ -976,37 +1330,18 
@@ restore_free(struct restorearg *ra, objset_t *os, return (err); } -void -dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) -{ - if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { - /* - * online incremental or new fs: destroy the fs (which - * may be a clone) that we created - */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - if (drc->drc_real_ds != drc->drc_logical_ds) - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } else { - /* - * offline incremental: rollback to most recent snapshot. - */ - (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); - dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); - } -} - /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ int -dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) +dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep) { - kthread_t *td = curthread; struct restorearg ra = { 0 }; dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; + int featureflags; if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; @@ -1031,30 +1366,69 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) if (ra.byteswap) { struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } - ra.td = td; + ra.td = curthread; ra.fp = fp; ra.voff = *voffp; ra.bufsize = 1<<20; ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ - ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + DMU_SUBSTREAM); ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ - VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); + VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + + /* if this stream is dedup'ed, set up the avl tree for guid mapping */ + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + minor_t minor; + + if (cleanup_fd == -1) { + ra.err = EBADF; + goto out; + } + ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (ra.err) { + cleanup_fd = -1; + goto out; + } + + if (*action_handlep == 0) { + ra.guid_to_ds_map = + kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(ra.guid_to_ds_map, guid_compare, + sizeof (guid_map_entry_t), + offsetof(guid_map_entry_t, avlnode)); + (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, + (void *)ra.guid_to_ds_map, + DS_FIND_CHILDREN); + ra.err = zfs_onexit_add_cb(minor, + free_guid_map_onexit, ra.guid_to_ds_map, + action_handlep); + if (ra.err) + goto out; + } else { + ra.err = zfs_onexit_cb_data(minor, *action_handlep, + (void **)&ra.guid_to_ds_map); + if (ra.err) + goto out; + } + } + /* * Read records and process them. 
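
For illustration only (not part of the patch): the loop below dispatches on the record type, with the new DRR_WRITE_BYREF and DRR_SPILL cases handled alongside the existing ones. A toy model of that dispatch; the record layout and names here are invented and far simpler than the real dmu_replay_record_t.

    #include <stdint.h>
    #include <stdio.h>

    enum rec_type { R_OBJECT, R_WRITE, R_WRITE_BYREF, R_SPILL, R_FREE, R_END };

    typedef struct rec {
        uint32_t  r_type;
        uint64_t  r_length;    /* payload bytes carried in the stream */
    } rec_t;

    /* Returns 0 to continue, 1 on the END record, -1 on an unknown type. */
    static int
    process_record(const rec_t *r)
    {
        switch (r->r_type) {
        case R_WRITE:          /* data is carried in the stream */
        case R_SPILL:          /* spill (overflow attribute) block */
            printf("write %ju payload bytes\n", (uintmax_t)r->r_length);
            return (0);
        case R_WRITE_BYREF:    /* no payload: copy an already-received block */
            printf("resolve refguid and copy the earlier block\n");
            return (0);
        case R_OBJECT:
        case R_FREE:
            return (0);
        case R_END:
            return (1);
        default:
            return (-1);       /* EINVAL in the real loop */
        }
    }

    int
    main(void)
    {
        rec_t stream[] = {
            { R_OBJECT, 0 }, { R_WRITE, 4096 },
            { R_WRITE_BYREF, 0 }, { R_SPILL, 512 }, { R_END, 0 },
        };

        for (size_t i = 0; i < sizeof (stream) / sizeof (stream[0]); i++)
            if (process_record(&stream[i]) != 0)
                break;
        return (0);
    }
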
*/ @@ -1094,6 +1468,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) ra.err = restore_write(&ra, os, &drrw); break; } + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwbr = + drr->drr_u.drr_write_byref; + ra.err = restore_write_byref(&ra, os, &drrwbr); + break; + } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; @@ -1112,6 +1493,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) ra.err = ECKSUM; goto out; } + case DRR_SPILL: + { + struct drr_spill drrs = drr->drr_u.drr_spill; + ra.err = restore_spill(&ra, os, &drrs); + break; + } default: ra.err = EINVAL; goto out; @@ -1121,15 +1508,22 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) ASSERT(ra.err != 0); out: - dmu_objset_close(os); + if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) + zfs_onexit_fd_rele(cleanup_fd); if (ra.err != 0) { /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. + * destroy what we created, so we don't leave it in the + * inconsistent restoring state. */ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - dmu_recv_abort_cleanup(drc); + + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + if (drc->drc_real_ds != drc->drc_logical_ds) { + mutex_exit(&drc->drc_logical_ds->ds_recvlock); + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } } kmem_free(ra.buf, ra.bufsize); @@ -1153,12 +1547,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvendsyncarg *resa = arg2; - dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); @@ -1170,35 +1564,31 @@ recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; } -int -dmu_recv_end(dmu_recv_cookie_t *drc) +static int +dmu_recv_existing_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; int err; /* - * XXX hack; seems the ds is still dirty and - * dsl_pool_zil_clean() expects it to have a ds_user_ptr - * (and zil), but clone_swap() can close it. + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. 
*/ txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (ds != drc->drc_real_ds) { - /* we are doing an online recv */ - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { - err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, - drc->drc_force); - if (err) - dsl_dataset_disown(ds, dmu_recv_tag); - } else { - err = EBUSY; - dsl_dataset_rele(ds, dmu_recv_tag); - } - /* dsl_dataset_destroy() will disown the ds */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); if (err) - return (err); + goto out; + } else { + mutex_exit(&ds->ds_recvlock); + dsl_dataset_rele(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + return (EBUSY); } resa.creation_time = drc->drc_drrb->drr_creation_time; @@ -1208,16 +1598,52 @@ dmu_recv_end(dmu_recv_cookie_t *drc) err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_end_check, recv_end_sync, ds, &resa, 3); if (err) { - if (drc->drc_newfs) { - ASSERT(ds == drc->drc_real_ds); - (void) dsl_dataset_destroy(ds, dmu_recv_tag); - return (err); - } else { - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); - } + /* swap back */ + (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); } - /* release the hold from dmu_recv_begin */ +out: + mutex_exit(&ds->ds_recvlock); dsl_dataset_disown(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); return (err); } + +static int +dmu_recv_new_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. + */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + /* clean up the fs we just recv'd into */ + (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); + } else { + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + } + return (err); +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + if (drc->drc_logical_ds != drc->drc_real_ds) + return (dmu_recv_existing_end(drc)); + else + return (dmu_recv_new_end(drc)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 89cbfad..023f90e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -33,17 +32,13 @@ #include #include #include +#include +#include #include -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} +int zfs_pd_blks_max = 100; -struct prefetch_data { +typedef struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; int pd_blks_max; @@ -51,47 +46,46 @@ struct prefetch_data { int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; -}; +} prefetch_data_t; -struct traverse_data { +typedef struct traverse_data { spa_t *td_spa; uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; int td_flags; - struct prefetch_data *td_pfd; + prefetch_data_t *td_pfd; blkptr_cb_t *td_func; void *td_arg; -}; +} traverse_data_t; -static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, +static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf, uint64_t objset, uint64_t object); -/* ARGSUSED */ -static void +static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { - struct traverse_data *td = arg; + traverse_data_t *td = arg; zbookmark_t zb; if (bp->blk_birth == 0) - return; + return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) - return; + return (0); + + SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - zb.zb_objset = td->td_objset; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); + + return (0); } -/* ARGSUSED */ -static void +static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { - struct traverse_data *td = arg; + traverse_data_t *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; @@ -99,28 +93,29 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth == 0) - return; + return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) - return; + return (0); + + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - zb.zb_objset = td->td_objset; - zb.zb_object = lr->lr_foid; - zb.zb_level = BP_GET_LEVEL(bp); - zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, + td->td_arg); } + return (0); } static void -traverse_zil(struct traverse_data *td, zil_header_t *zh) +traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). + * replayed; plus, in read-only mode, blocks that are already stable. 
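
For illustration only (not part of the patch): the hunks that follow add a TRAVERSE_HARD mode in which traverse_visitbp() and traverse_dnode() remember the last error and keep walking instead of aborting at the first failure, so a single unreadable block does not abandon the whole walk. A standalone sketch of that pattern; visit() and the error value are invented.

    #include <stdio.h>

    #define NCHILD  4

    static int nvisited;

    /* Invented visitor: pretend the third child is unreadable. */
    static int
    visit(int child)
    {
        nvisited++;
        return (child == 2 ? 5 : 0);    /* 5 stands in for EIO */
    }

    static int
    walk_children(int hard)
    {
        int err = 0, lasterr = 0;

        for (int i = 0; i < NCHILD; i++) {
            err = visit(i);
            if (err != 0) {
                if (!hard)
                    break;       /* default: stop at the first error */
                lasterr = err;   /* hard: remember it and keep going */
            }
        }
        return (err != 0 ? err : lasterr);
    }

    int
    main(void)
    {
        int err;

        nvisited = 0;
        err = walk_children(0);
        printf("default: error %d after visiting %d children\n", err, nvisited);

        nvisited = 0;
        err = walk_children(1);
        printf("hard:    error %d after visiting %d children\n", err, nvisited);
        return (0);
    }
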
*/ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; @@ -134,16 +129,18 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh) } static int -traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, +traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; - int err = 0; + int err = 0, lasterr = 0; arc_buf_t *buf = NULL; - struct prefetch_data *pd = td->td_pfd; + prefetch_data_t *pd = td->td_pfd; + boolean_t hard = td->td_flags & TRAVERSE_HARD; if (bp->blk_birth == 0) { - err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, + td->td_arg); return (err); } @@ -163,7 +160,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); if (err) return (err); } @@ -174,7 +174,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, + err = dsl_read(NULL, td->td_spa, bp, pbuf, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -187,15 +187,18 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, buf, cbp, &czb); - if (err) - break; + if (err) { + if (!hard) + break; + lasterr = err; + } } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, + err = dsl_read(NULL, td->td_spa, bp, pbuf, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -203,33 +206,43 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ dnp = buf->b_data; - for (i = 0; i < epb && err == 0; i++, dnp++) { + for (i = 0; i < epb; i++, dnp++) { err = traverse_dnode(td, dnp, buf, zb->zb_objset, zb->zb_blkid * epb + i); - if (err) - break; + if (err) { + if (!hard) + break; + lasterr = err; + } } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; dnode_phys_t *dnp; - err = arc_read_nolock(NULL, td->td_spa, bp, + err = dsl_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); osp = buf->b_data; - traverse_zil(td, &osp->os_zil_header); - dnp = &osp->os_meta_dnode; - err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0); + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if (err && hard) { + lasterr = err; + err = 0; + } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_userused_dnode; err = traverse_dnode(td, dnp, buf, zb->zb_objset, DMU_USERUSED_OBJECT); } + if (err && hard) { + lasterr = err; + err = 0; + } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_groupused_dnode; err = traverse_dnode(td, dnp, buf, zb->zb_objset, @@ -240,35 +253,54 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); - if (err == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + if (err == 0 
&& lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); + } - return (err); + return (err != 0 ? err : lasterr); } static int -traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, +traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf, uint64_t objset, uint64_t object) { - int j, err = 0; + int j, err = 0, lasterr = 0; zbookmark_t czb; + boolean_t hard = (td->td_flags & TRAVERSE_HARD); for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, buf, (blkptr_t *)&dnp->dn_blkptr[j], &czb); - if (err) - break; + if (err) { + if (!hard) + break; + lasterr = err; + } } - return (err); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, + object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_spill, &czb); + if (err) { + if (!hard) + return (err); + lasterr = err; + } + } + return (err != 0 ? err : lasterr); } /* ARGSUSED */ static int -traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, + void *arg) { - struct prefetch_data *pfd = arg; + prefetch_data_t *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; ASSERT(pfd->pd_blks_fetched >= 0); @@ -276,7 +308,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, return (EINTR); if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (0); mutex_enter(&pfd->pd_mtx); @@ -286,7 +319,7 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); - (void) arc_read_nolock(NULL, spa, bp, NULL, NULL, + (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); @@ -297,15 +330,16 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, static void traverse_prefetch_thread(void *arg) { - struct traverse_data *td_main = arg; - struct traverse_data td = *td_main; + traverse_data_t *td_main = arg; + traverse_data_t td = *td_main; zbookmark_t czb; td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; - SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0); + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); @@ -319,16 +353,16 @@ traverse_prefetch_thread(void *arg) * in syncing context). */ static int -traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, +traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - struct traverse_data td; - struct prefetch_data pd = { 0 }; + traverse_data_t td; + prefetch_data_t pd = { 0 }; zbookmark_t czb; int err; td.td_spa = spa; - td.td_objset = objset; + td.td_objset = ds ? 
ds->ds_object : 0; td.td_rootbp = rootbp; td.td_min_txg = txg_start; td.td_func = func; @@ -336,17 +370,29 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, td.td_pfd = &pd; td.td_flags = flags; - pd.pd_blks_max = 100; + pd.pd_blks_max = zfs_pd_blks_max; pd.pd_flags = flags; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + /* See comment on ZIL traversal in dsl_scan_visitds. */ + if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { + objset_t *os; + + err = dmu_objset_from_ds(ds, &os); + if (err) + return (err); + + traverse_zil(&td, &os->os_zil_header); + } + if (!(flags & TRAVERSE_PREFETCH) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; - SET_BOOKMARK(&czb, objset, 0, -1, 0); + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); @@ -370,7 +416,7 @@ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object, + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); } @@ -378,43 +424,59 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, * NB: pool must not be changing on-disk (eg, from zdb or sync context). */ int -traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg) +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) { - int err; + int err, lasterr = 0; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; + boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), - 0, TRAVERSE_PRE, func, arg); + err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), + txg_start, flags, func, arg); if (err) return (err); /* visit each dataset */ - for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) { + for (obj = 1; err == 0 || (err != ESRCH && hard); + err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); - if (err) - return (err); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } if (doi.doi_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; + uint64_t txg = txg_start; + rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); rw_exit(&dp->dp_config_rwlock); - if (err) - return (err); - err = traverse_dataset(ds, - ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE, - func, arg); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } + if (ds->ds_phys->ds_prev_snap_txg > txg) + txg = ds->ds_phys->ds_prev_snap_txg; + err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err) - return (err); + if (err) { + if (!hard) + return (err); + lasterr = err; + } } } if (err == ESRCH) err = 0; - return (err); + return (err != 0 ? err : lasterr); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index b6a5cdb..81b8436 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -33,7 +32,10 @@ #include #include /* for fzap_default_block_shift */ #include +#include +#include #include +#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); @@ -48,6 +50,8 @@ dmu_tx_create_dd(dsl_dir_t *dd) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); #ifdef ZFS_DEBUG refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); @@ -58,9 +62,9 @@ dmu_tx_create_dd(dsl_dir_t *dd) dmu_tx_t * dmu_tx_create(objset_t *os) { - dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -98,7 +102,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, int err; if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os->os, object, tx, &dn); + err = dnode_hold(os, object, tx, &dn); if (err) { tx->tx_err = err; return (NULL); @@ -161,38 +165,47 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) } static void -dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db, - boolean_t freeable, dmu_buf_impl_t **history) +dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, + int level, uint64_t blkid, boolean_t freeable, uint64_t *history) { - int i = db->db_level + 1; - dnode_t *dn = db->db_dnode; - - if (i >= dn->dn_nlevels) + objset_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent = NULL; + blkptr_t *bp = NULL; + uint64_t space; + + if (level >= dn->dn_nlevels || history[level] == blkid) return; - db = db->db_parent; - if (db == NULL) { - uint64_t lvls = dn->dn_nlevels - i; + history[level] = blkid; - txh->txh_space_towrite += lvls << dn->dn_indblkshift; - return; + space = (level == 0) ? 
dn->dn_datablksz : (1ULL << dn->dn_indblkshift); + + if (db == NULL || db == dn->dn_dbuf) { + ASSERT(level != 0); + db = NULL; + } else { + ASSERT(DB_DNODE(db) == dn); + ASSERT(db->db_level == level); + ASSERT(db->db.db_size == space); + ASSERT(db->db_blkid == blkid); + bp = db->db_blkptr; + parent = db->db_parent; } - if (db != history[i]) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint64_t space = 1ULL << dn->dn_indblkshift; + freeable = (bp && (freeable || + dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); - freeable = (db->db_blkptr && (freeable || - dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth))); - if (freeable) - txh->txh_space_tooverwrite += space; - else - txh->txh_space_towrite += space; - if (db->db_blkptr) - txh->txh_space_tounref += space; - history[i] = db; - dmu_tx_count_indirects(txh, db, freeable, history); - } + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (bp) + txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + + dmu_tx_count_twig(txh, dn, parent, level + 1, + blkid >> epbs, freeable, history); } /* ARGSUSED */ @@ -213,7 +226,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) max_ibs = DN_MAX_INDBLKSHIFT; if (dn) { - dmu_buf_impl_t *last[DN_MAX_LEVELS]; + uint64_t history[DN_MAX_LEVELS]; int nlvls = dn->dn_nlevels; int delta; @@ -221,7 +234,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) * For i/o error checking, read the first and last level-0 * blocks (if they are not aligned), and all the level-1 blocks. */ - if (dn->dn_maxblkid == 0) { delta = dn->dn_datablksz; start = (off < dn->dn_datablksz) ? 0 : 1; @@ -247,7 +259,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; - if (end != start && + if (end != start && end <= dn->dn_maxblkid && P2PHASE(off+len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err) @@ -290,29 +302,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) * If this write is not off the end of the file * we need to account for overwrites/unref. 
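
For illustration only (not part of the patch): dmu_tx_count_twig() above charges each touched level-0 block and each of its ancestor indirect blocks exactly once, using a per-level history of the last block id already counted (the history[] array is initialized a few lines below). A simplified standalone model of that bookkeeping; the block sizes and level count are assumptions, and the freeable/tooverwrite distinction is omitted.

    #include <stdint.h>
    #include <stdio.h>

    #define NLEVELS     4        /* assumed dn_nlevels */
    #define EPBS        7        /* assumed 2^7 = 128 blkptrs per indirect block */
    #define DATABLKSZ   131072   /* assumed 128K data blocks */
    #define INDBLKSZ    16384    /* assumed 16K indirect blocks */

    int
    main(void)
    {
        uint64_t history[NLEVELS];
        uint64_t towrite = 0;

        for (int l = 0; l < NLEVELS; l++)
            history[l] = UINT64_MAX;    /* "nothing counted yet", like -1ULL */

        /* A write that touches level-0 blocks 100 through 356. */
        for (uint64_t blkid = 100; blkid <= 356; blkid++) {
            uint64_t b = blkid;

            for (int level = 0; level < NLEVELS; level++) {
                if (history[level] == b)
                    break;      /* this block and its ancestors are counted */
                history[level] = b;
                towrite += (level == 0) ? DATABLKSZ : INDBLKSZ;
                b >>= EPBS;     /* parent indirect block's id */
            }
        }
        printf("estimated bytes to dirty: %ju\n", (uintmax_t)towrite);
        return (0);
    }

The real function additionally decides, per block, whether the space counts as an overwrite of freeable data (txh_space_tooverwrite) or as newly written data (txh_space_towrite).
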
*/ - if (start <= dn->dn_maxblkid) - bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS); + if (start <= dn->dn_maxblkid) { + for (int l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } while (start <= dn->dn_maxblkid) { - spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, 0, start, FTAG); + err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db->db_blkptr && dsl_dataset_block_freeable(ds, - db->db_blkptr->blk_birth)) { - dprintf_bp(db->db_blkptr, "can free old%s", ""); - txh->txh_space_tooverwrite += dn->dn_datablksz; - txh->txh_space_tounref += dn->dn_datablksz; - dmu_tx_count_indirects(txh, db, TRUE, last); - } else { - txh->txh_space_towrite += dn->dn_datablksz; - if (db->db_blkptr) - txh->txh_space_tounref += - bp_get_dasize(spa, db->db_blkptr); - dmu_tx_count_indirects(txh, db, FALSE, last); + + if (err) { + txh->txh_tx->tx_err = err; + return; } + + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, + history); dbuf_rele(db, FTAG); if (++start > end) { /* @@ -377,13 +384,13 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth)) { + dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; txh->txh_space_tounref += space; } else { @@ -428,7 +435,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * Also, dbuf_hold_level() wants us to have the struct_rwlock. + * Also, dbuf_hold_impl() wants us to have the struct_rwlock. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -458,9 +465,9 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { + if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dasize(spa, bp); + space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } @@ -516,14 +523,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); - - txh->txh_memory_tohold += dbuf->db.db_size; - if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { - txh->txh_tx->tx_err = E2BIG; - dbuf_rele(dbuf, FTAG); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + if (err) { + txh->txh_tx->tx_err = err; break; } + + txh->txh_memory_tohold += dbuf->db.db_size; + + /* + * We don't check memory_tohold against DMU_MAX_ACCESS because + * memory_tohold is an over-estimation (especially the >L1 + * indirect blocks), so it could fail. Callers should have + * already verified that they will not be holding too much + * memory. 
+ */ + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); if (err != 0) { txh->txh_tx->tx_err = err; @@ -535,9 +550,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) bp += blkoff; for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + if (dsl_dataset_block_freeable(ds, &bp[i], + bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); + space += bp_get_dsize(spa, &bp[i]); } unref += BP_GET_ASIZE(bp); } @@ -582,6 +598,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_dnode(txh); + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) @@ -624,7 +642,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } } - dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } @@ -674,6 +691,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * the size will change between now and the dbuf dirty call. */ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + &dn->dn_phys->dn_blkptr[0], dn->dn_phys->dn_blkptr[0].blk_birth)) { txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; } else { @@ -689,7 +707,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. */ - err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + err = zap_lookup(dn->dn_objset, dn->dn_object, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; @@ -697,7 +715,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) } } - err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add, + err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* @@ -769,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); - if (tx->tx_anyobj) + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); return; + } /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); return; + } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { @@ -809,10 +833,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) match_offset = TRUE; /* * We will let this hold work for the bonus - * buffer so that we don't need to hold it - * when creating a new object. + * or spill buffer so that we don't need to + * hold it when creating a new object. 
*/ - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID || + blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, @@ -833,8 +858,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; case THT_BONUS: - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: @@ -847,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) ASSERT(!"bad txh_type"); } } - if (match_object && match_offset) + if (match_object && match_offset) { + DB_DNODE_EXIT(db); return; + } } + DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); @@ -932,7 +964,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * assume that we won't be able to free or overwrite anything. */ if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > tx->tx_lastsnap_txg) { towrite += tooverwrite; tooverwrite = tofree = 0; @@ -1113,8 +1145,13 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", @@ -1143,6 +1180,14 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, @@ -1159,3 +1204,179 @@ dmu_tx_get_txg(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. + */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while (dcb = list_head(cb_list)) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. 
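
For illustration only (not part of the patch): the commit-callback list added above, via dmu_tx_callback_register() and dmu_tx_do_callbacks(), lets a consumer learn whether its transaction committed (error 0) or was aborted. A minimal userland model of the same register-then-dispatch flow; apart from those two behaviors, every name here is invented, and the real code keeps the callbacks on a list_t.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef void cb_func_t(void *arg, int error);

    typedef struct cb {
        cb_func_t  *func;
        void       *arg;
        struct cb  *next;
    } cb_t;

    typedef struct tx {
        cb_t  *callbacks;
    } tx_t;

    /* Analogue of dmu_tx_callback_register(): remember (func, arg) on the tx. */
    static void
    tx_callback_register(tx_t *tx, cb_func_t *func, void *arg)
    {
        cb_t *cb = malloc(sizeof (*cb));

        if (cb == NULL)
            abort();
        cb->func = func;
        cb->arg = arg;
        cb->next = tx->callbacks;
        tx->callbacks = cb;
    }

    /* Analogue of dmu_tx_do_callbacks(): run and free every callback once. */
    static void
    tx_do_callbacks(tx_t *tx, int error)
    {
        cb_t *cb;

        while ((cb = tx->callbacks) != NULL) {
            tx->callbacks = cb->next;
            cb->func(cb->arg, error);
            free(cb);
        }
    }

    static void
    note_done(void *arg, int error)
    {
        printf("%s: error %d\n", (const char *)arg, error);
    }

    int
    main(void)
    {
        tx_t committed = { NULL };
        tx_t aborted = { NULL };

        tx_callback_register(&committed, note_done, "committed tx");
        tx_callback_register(&aborted, note_done, "aborted tx");

        tx_do_callbacks(&committed, 0);          /* as after the txg commits */
        tx_do_callbacks(&aborted, ECANCELED);    /* as in dmu_tx_abort() */
        return (0);
    }

In the patch itself, dmu_tx_commit() hands a non-empty list to txg_register_callbacks(), while dmu_tx_abort() runs the callbacks immediately with ECANCELED.
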
+ */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + int i; + + if (!sa->sa_need_attr_registration) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dnode_t *dn; + dmu_tx_hold_t *txh; + blkptr_t *bp; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + + dn = txh->txh_dnode; + + if (dn == NULL) + return; + + /* If blkptr doesn't exist then add space to towrite */ + if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref = 0; + } else { + bp = &dn->dn_phys->dn_spill; + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + bp, bp->blk_birth)) + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + else + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + if (bp->blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + } +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. 
+ */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index acf6284..b5ca666 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -244,7 +244,7 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) break; } zs->zst_ph_offset = prefetch_tail; - zs->zst_last = LBOLT; + zs->zst_last = ddi_get_lbolt(); } void @@ -405,6 +405,7 @@ top: rc = 1; goto out; } + if (zh->zst_offset != zs->zst_offset + zs->zst_len) { mutex_exit(&zs->zst_lock); goto top; @@ -432,6 +433,7 @@ top: rc = 1; goto out; } + if (zh->zst_offset != zs->zst_offset - zh->zst_len) { mutex_exit(&zs->zst_lock); goto top; @@ -462,6 +464,7 @@ top: rc = 1; goto out; } + if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= zs->zst_len) || (zs->zst_len == zs->zst_stride)) { mutex_exit(&zs->zst_lock); @@ -481,6 +484,7 @@ top: rc = 1; goto out; } + if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= zs->zst_len) || (zs->zst_len == zs->zst_stride)) { mutex_exit(&zs->zst_lock); @@ -603,7 +607,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf) for (zs = list_head(&zf->zf_stream); zs; zs = list_next(&zf->zf_stream, zs)) { - if (((LBOLT - zs->zst_last) / hz) > zfetch_min_sec_reap) + if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) break; } @@ -734,7 +738,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; newstream->zst_cap = zst.zst_len; newstream->zst_direction = ZFETCH_FORWARD; - newstream->zst_last = LBOLT; + newstream->zst_last = ddi_get_lbolt(); mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index f9661d6..b43035b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -39,19 +38,35 @@ static int free_range_compar(const void *node1, const void *node2); static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ + +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; +#ifdef sun +static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); +#endif + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { - int i; dnode_t *dn = arg; - bzero(dn, sizeof (dnode_t)); + int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -60,8 +75,18 @@ dnode_cons(void *arg, void *unused, int kmflag) refcount_create(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { + list_link_init(&dn->dn_dirty_link[i]); avl_create(&dn->dn_ranges[i], free_range_compar, sizeof (free_range_t), offsetof(struct free_range, fr_node)); @@ -70,9 +95,28 @@ dnode_cons(void *arg, void *unused, int kmflag) offsetof(dbuf_dirty_record_t, dr_dirty_node)); } + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + dn->dn_moved = 0; + POINTER_INVALIDATE(&dn->dn_objset); return (0); } @@ -89,27 +133,56 @@ dnode_dest(void *arg, void *unused) cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); avl_destroy(&dn->dn_ranges[i]); list_destroy(&dn->dn_dirty_records[i]); + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); } + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_free_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT3U(dn->dn_dirtyctx, ==, 0); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT3U(dn->dn_oldused, ==, 0); + ASSERT3U(dn->dn_oldflags, ==, 0); + ASSERT3U(dn->dn_olduid, ==, 0); + ASSERT3U(dn->dn_oldgid, ==, 0); + ASSERT3U(dn->dn_newuid, ==, 0); + ASSERT3U(dn->dn_newgid, 
==, 0); + ASSERT3U(dn->dn_id_flags, ==, 0); + + ASSERT3U(dn->dn_dbufs_count, ==, 0); list_destroy(&dn->dn_dbufs); } void dnode_init(void) { + ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + kmem_cache_set_move(dnode_cache, dnode_move); } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); + dnode_cache = NULL; } @@ -121,6 +194,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); + ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); @@ -210,6 +284,11 @@ dnode_byteswap(dnode_phys_t *dnp) ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); } + + /* Swap SPILL block if we have one */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); + } void @@ -258,6 +337,27 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); } +void +dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dn->dn_bonustype = newtype; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + dnode_setdirty(dn, tx); + dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; + dn->dn_have_spill = B_FALSE; +} + static void dnode_setdblksz(dnode_t *dn, int size) { @@ -272,18 +372,30 @@ dnode_setdblksz(dnode_t *dn, int size) } static dnode_t * -dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object) +dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, + uint64_t object, dnode_handle_t *dnh) { dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); - dn->dn_objset = os; + ASSERT(!POINTER_IS_VALID(dn->dn_objset)); + dn->dn_moved = 0; + + /* + * Defer setting dn_objset until the dnode is ready to be a candidate + * for the dnode_move() callback. + */ dn->dn_object = object; dn->dn_dbuf = db; + dn->dn_handle = dnh; dn->dn_phys = dnp; - if (dnp->dn_datablkszsec) + if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; @@ -293,49 +405,71 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; + dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); + dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); + membar_producer(); + /* + * Everything else must be valid before assigning dn_objset makes the + * dnode eligible for dnode_move(). + */ + dn->dn_objset = os; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } +/* + * Caller must be holding the dnode handle, which is released upon return. 
+ */ static void dnode_destroy(dnode_t *dn) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; -#ifdef ZFS_DEBUG - int i; - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); - ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); - } - ASSERT(NULL == list_head(&dn->dn_dbufs)); -#endif - ASSERT(dn->dn_oldphys == NULL); + ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); + POINTER_INVALIDATE(&dn->dn_objset); list_remove(&os->os_dnodes, dn); mutex_exit(&os->os_lock); - if (dn->dn_dirtyctx_firstset) { + /* the dnode can no longer move, so we can release the handle */ + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + + dn->dn_dirtyctx = 0; + if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } - dmu_zfetch_rele(&dn->dn_zfetch); - if (dn->dn_bonus) { + if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } @@ -367,6 +501,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(ot != DMU_OT_NONE); ASSERT3U(ot, <, DMU_OT_NUMTYPES); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); @@ -379,9 +514,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); @@ -392,7 +530,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + dn->dn_nblkptr = 1; + else + dn->dn_nblkptr = 1 + + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; @@ -406,10 +548,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, } dn->dn_allocated_txg = tx->tx_txg; + dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -425,13 +569,16 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 
0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0)); + (bonustype != DMU_OT_NONE && bonuslen != 0) || + (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); + dn->dn_id_flags = 0; + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { @@ -444,9 +591,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + nblkptr = 1; + else + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (dn->dn_bonustype != bonustype) + dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } rw_exit(&dn->dn_struct_rwlock); /* change type */ @@ -472,9 +629,306 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_exit(&dn->dn_mtx); } +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); + + /* Copy fields. 
*/ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(refcount_count(&odn->dn_tx_holds) == 0); + refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(list_is_empty(&ndn->dn_dbufs)); + list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; + ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; + + /* + * Update back pointers. Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. + */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + odn->dn_zfetch.zf_dnode = NULL; + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. 
+ */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_ranges[i].avl_root = NULL; + odn->dn_ranges[i].avl_numnodes = 0; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +#ifdef sun +#ifdef _KERNEL +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. + */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. 
+ */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = odn->dn_dbufs_count; + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. */ + ASSERT(refcount == refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == ndn->dn_dbufs_count); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* _KERNEL */ +#endif /* sun */ + void -dnode_special_close(dnode_t *dn) +dnode_special_close(dnode_handle_t *dnh) { + dnode_t *dn = dnh->dnh_dnode; + /* * Wait for final references to the dnode to clear. 
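The dnode_move() callback above follows the generic kmem relocation contract: given the old buffer and the prospective new one, the callback either performs the copy and repoints everything at the new location, or returns a verdict telling the allocator to skip or retry the object. A minimal sketch of that contract for a hypothetical cache (the example_* type and helpers are illustrative placeholders; only kmem_cache_set_move() and the KMEM_CBRC_* verdicts come from the interface used above):

        static kmem_cbrc_t
        example_move(void *buf, void *newbuf, size_t size, void *arg)
        {
                example_t *old = buf;

                if (!POINTER_IS_VALID(old->ex_owner))
                        return (KMEM_CBRC_DONT_KNOW);   /* not a known, live object */
                if (!example_trylock(old))
                        return (KMEM_CBRC_LATER);       /* busy now; retry on a later pass */
                bcopy(buf, newbuf, size);               /* relocate the object ... */
                example_fix_backptrs(newbuf);           /* ... and repoint everything at the copy */
                example_unlock(old);
                return (KMEM_CBRC_YES);
        }

        kmem_cache_set_move(example_cache, example_move);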
This can * only happen if the arc is asyncronously evicting state that @@ -483,13 +937,19 @@ dnode_special_close(dnode_t *dn) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); - dnode_destroy(dn); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } dnode_t * -dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, + dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object); + dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); + dnh->dnh_dnode = dn; + zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); return (dn); } @@ -497,34 +957,43 @@ dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) static void dnode_buf_pageout(dmu_buf_t *db, void *arg) { - dnode_t **children_dnodes = arg; + dnode_children_t *children_dnodes = arg; int i; int epb = db->db_size >> DNODE_SHIFT; + ASSERT(epb == children_dnodes->dnc_count); + for (i = 0; i < epb; i++) { - dnode_t *dn = children_dnodes[i]; - int n; + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + dnode_t *dn; - if (dn == NULL) + /* + * The dnode handle lock guards against the dnode moving to + * another valid address, so there is no need here to guard + * against changes to or from NULL. + */ + if (dnh->dnh_dnode == NULL) { + zrl_destroy(&dnh->dnh_zrlock); continue; -#ifdef ZFS_DEBUG + } + + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligable for eviction and this function + * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); - ASSERT(list_head(&dn->dn_dbufs) == NULL); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - for (n = 0; n < TXG_SIZE; n++) - ASSERT(!list_link_active(&dn->dn_dirty_link[n])); -#endif - children_dnodes[i] = NULL; - dnode_destroy(dn); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); } /* @@ -534,7 +1003,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) * succeeds even for free dnodes. */ int -dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, +dnode_hold_impl(objset_t *os, uint64_t object, int flag, void *tag, dnode_t **dnp) { int epb, idx, err; @@ -543,17 +1012,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_t **children_dnodes; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't - * be asking the DMU to do *anything*. + * be asking the DMU to do *anything* unless it's the root pool + * which may require us to read from the root filesystem while + * holding some (not all) of the locks as writer. */ - ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || + (spa_is_root(os->os_spa) && + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? 
- os->os_userused_dnode : os->os_groupused_dnode; + DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (ENOENT); type = dn->dn_type; @@ -570,7 +1044,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); - mdn = os->os_meta_dnode; + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); @@ -597,26 +1072,39 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, idx = object & (epb-1); + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { - dnode_t **winner; - children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), - KM_SLEEP); + int i; + dnode_children_t *winner; + children_dnodes = kmem_alloc(sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + dnh[i].dnh_dnode = NULL; + } if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, dnode_buf_pageout)) { - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); children_dnodes = winner; } } + ASSERT(children_dnodes->dnc_count == epb); - if ((dn = children_dnodes[idx]) == NULL) { - dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + if ((dn = dnh->dnh_dnode) == NULL) { + dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, dnp, db, object); - winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); + dn = dnode_create(os, phys, db, object, dnh); + winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); if (winner != NULL) { - dnode_destroy(dn); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ dn = winner; } } @@ -626,15 +1114,18 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ((flag & DNODE_MUST_BE_FREE) && - (type != DMU_OT_NONE || dn->dn_oldphys))) { + (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); + zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } mutex_exit(&dn->dn_mtx); if (refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dn); + dbuf_add_ref(db, dnh); + /* Now we can rely on the hold to prevent the dnode from moving. */ + zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); @@ -649,7 +1140,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, * Return held dnode if the object is allocated, NULL if not. */ int -dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) +dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } @@ -676,19 +1167,43 @@ void dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; + /* Get while the hold prevents the dnode from moving. 
*/ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. + */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && dn->dn_dbuf) - dbuf_rele(dn->dn_dbuf, dn); + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + dbuf_rele(db, dnh); + } } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { @@ -701,10 +1216,15 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif + /* + * Determine old uid/gid when necessary + */ + dmu_objset_userquota_get_ids(dn, B_TRUE, tx); + mutex_enter(&os->os_lock); /* @@ -719,6 +1239,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_datablksz != 0); ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); + ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); @@ -734,7 +1255,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child - * dbuf maintaines a hold on the dnode. When the last child + * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its @@ -813,7 +1334,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -857,7 +1379,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) int epbs, new_nlevels; uint64_t sz; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? 
RW_READ_HELD(&dn->dn_struct_rwlock) : @@ -904,6 +1426,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); @@ -914,7 +1437,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); @@ -1169,6 +1693,20 @@ out: rw_exit(&dn->dn_struct_rwlock); } +static boolean_t +dnode_spill_freed(dnode_t *dn) +{ + int i; + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) @@ -1177,7 +1715,7 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid) void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) return (FALSE); /* @@ -1190,6 +1728,9 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid) if (dn->dn_free_txg) return (TRUE); + if (blkid == DMU_SPILL_BLKID) + return (dnode_spill_freed(dn)); + range_tofind.fr_blkid = blkid; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { @@ -1247,7 +1788,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; if (space > 0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 3bf0c81..32afe7d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -77,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) if (child == NULL) continue; - ASSERT3P(child->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != @@ -120,7 +123,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) continue; - bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); + bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); bzero(bp, sizeof (blkptr_t)); blocks_freed += 1; @@ -136,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; + dnode_t *dn; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<<epbs); num = end - start + 1; ASSERT3U(off, >=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); @@ -156,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, (db->db_blkid << epbs) + i, TRUE, FTAG, &child); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); @@ -201,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) dbuf_rele(child, FTAG); } + DB_DNODE_EXIT(db); } #endif @@ -210,7 +217,7 @@ static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; @@ -228,10 +235,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); + dbuf_release_bp(db); bp = (blkptr_t *)db->db.db_data; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; @@ -254,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + DB_DNODE_EXIT(db); return (all ?
ALL : blocks_freed); } @@ -273,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } dbuf_rele(subdb, FTAG); } + DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); #ifdef ZFS_DEBUG bp -= (end-start)+1; @@ -376,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn) for (; db != &marker; db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); - ASSERT3P(db->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { @@ -424,6 +439,9 @@ dnode_undirty_dbufs(list_t *list) dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; + if (db->db_level != 0) + dnode_undirty_dbufs(&dr->dt.di.dr_children); + mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); @@ -431,18 +449,15 @@ db->db_last_dirty = NULL; db->db_dirtycnt -= 1; if (db->db_level == 0) { - ASSERT(db->db_blkid == DB_BONUS_BLKID || + ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); - mutex_exit(&db->db_mtx); } else { - mutex_exit(&db->db_mtx); - dnode_undirty_dbufs(&dr->dt.di.dr_children); list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } } @@ -493,6 +508,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; + dn->dn_have_spill = B_FALSE; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -515,6 +531,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; static const dnode_phys_t zerodn = { 0 }; + boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); @@ -526,10 +543,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) if (dmu_objset_userused_enabled(dn->dn_objset) && !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - ASSERT(dn->dn_oldphys == NULL); - dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); - *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); + dn->dn_oldflags = dn->dn_phys->dn_flags; dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + mutex_exit(&dn->dn_mtx); + dmu_objset_userquota_get_ids(dn, B_FALSE, tx); } else { /* Once we account for it, we should always account for it. */ ASSERT(!(dn->dn_phys->dn_flags & @@ -560,6 +579,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || dn->dn_maxblkid == 0 || list_head(list) != NULL || + avl_last(&dn->dn_ranges[txgoff]) || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -576,6 +596,24 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_bonuslen[txgoff] = 0; } + if (dn->dn_next_bonustype[txgoff]) { + ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); + dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; + dn->dn_next_bonustype[txgoff] = 0; + } + + /* + * We will either remove a spill block when a file is being removed + * or we have been asked to remove it.
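The dn_next_* and dn_rm_spillblk arrays consumed in this hunk follow the DMU's usual per-txg bookkeeping: only TXG_SIZE transaction groups can be in flight at once, so open context parks a pending change in the slot txg & TXG_MASK and dnode_sync() applies and clears that slot in syncing context. A sketch of the two halves of the pattern, assuming the stock TXG_SIZE of 4 (TXG_MASK of 3) and with new_bonustype standing in for whatever value the caller chose:

        /* open context: record the pending change for the txg this tx will sync in */
        dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = new_bonustype;

        /* syncing context (dnode_sync): apply it to the on-disk dnode and clear the slot */
        int txgoff = tx->tx_txg & TXG_MASK;
        if (dn->dn_next_bonustype[txgoff] != 0) {
                dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
                dn->dn_next_bonustype[txgoff] = 0;
        }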
+ */ + if (dn->dn_rm_spillblk[txgoff] || + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && + dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) { + if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + kill_spill = B_TRUE; + dn->dn_rm_spillblk[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -592,6 +630,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); + if (kill_spill) { + (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + mutex_enter(&dn->dn_mtx); + dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + /* process all the "freed" ranges in the file */ while (rp = avl_last(&dn->dn_ranges[txgoff])) { dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index ac9d67f..19b663e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -38,16 +37,24 @@ #include #include #include -#include +#include +#include +#include +#include static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_checkfunc_t dsl_dataset_rollback_check; -static dsl_syncfunc_t dsl_dataset_rollback_sync; static dsl_syncfunc_t dsl_dataset_set_reservation_sync; +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE @@ -76,14 +83,14 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) } void -dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - dprintf_bp(bp, "born, ds=%p\n", ds); + dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ @@ -103,6 +110,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -119,29 +127,26 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) } int -dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, - dmu_tx_t *tx) +dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, + boolean_t async) { - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - ASSERT(pio != NULL); - ASSERT(dmu_tx_is_syncing(tx)); - /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) return (0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(bp->blk_birth <= tx->tx_txg); + + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + ASSERT(used > 0); if (ds == NULL) { - int err; /* * Account 
for the meta-objset space in its placeholder * dataset. */ - err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); - ASSERT(err == 0); + dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); @@ -154,13 +159,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { - int err; int64_t delta; - dprintf_bp(bp, "freeing: %s", ""); - err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); - ASSERT(err == 0); + dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); + dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); @@ -176,7 +178,18 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + if (async) { + /* + * We are here as part of zio's write done callback, + * which means we're a zio interrupt thread. We can't + * call dsl_deadlist_insert() now because it may block + * waiting for I/O. Instead, put bp on the deferred + * queue and let dsl_pool_sync() finish the job. + */ + bplist_append(&ds->ds_pending_deadlist, bp); + } else { + dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); + } ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); @@ -189,7 +202,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, ds->ds_prev->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_origin_txg) { + if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -230,9 +243,15 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) } boolean_t -dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) +dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, + uint64_t blk_birth) { - return (blk_birth > dsl_dataset_prev_snap_txg(ds)); + if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) + return (B_FALSE); + + ddt_prefetch(dsl_dataset_get_spa(ds), bp); + + return (B_TRUE); } /* ARGSUSED */ @@ -243,19 +262,23 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); - dprintf_ds(ds, "evicting %s\n", ""); - unique_remove(ds->ds_fsid_guid); - if (ds->ds_user_ptr != NULL) - ds->ds_user_evict_func(ds, ds->ds_user_ptr); + if (ds->ds_objset != NULL) + dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_drop_ref(ds->ds_prev, ds); ds->ds_prev = NULL; } - bplist_close(&ds->ds_deadlist); + bplist_destroy(&ds->ds_pending_deadlist); + if (db != NULL) { + dsl_deadlist_close(&ds->ds_deadlist); + } else { + ASSERT(ds->ds_deadlist.dl_dbuf == NULL); + ASSERT(!ds->ds_deadlist.dl_oldfmt); + } if (ds->ds_dir) dsl_dir_close(ds->ds_dir, ds); @@ -264,12 +287,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) if (mutex_owned(&ds->ds_lock)) mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); if (mutex_owned(&ds->ds_opening_lock)) mutex_exit(&ds->ds_opening_lock); mutex_destroy(&ds->ds_opening_lock); - if (mutex_owned(&ds->ds_deadlist.bpl_lock)) - mutex_exit(&ds->ds_deadlist.bpl_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); @@ -329,6 +350,8 
@@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) matchtype_t mt; int err; + dsl_dir_snap_cmtime_update(ds->ds_dir); + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else @@ -348,6 +371,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; + dmu_object_info_t doi; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); @@ -355,6 +379,12 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err) return (err); + + /* Make sure dsobj has the correct object type. */ + dmu_object_info_from_db(dbuf, &doi); + if (doi.doi_type != DMU_OT_DSL_DATASET) + return (EINVAL); + ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner; @@ -365,28 +395,27 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, - NULL); rw_init(&ds->ds_rwlock, 0, 0, 0); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); - err = bplist_open(&ds->ds_deadlist, + bplist_create(&ds->ds_pending_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + if (err == 0) { err = dsl_dir_open_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); } if (err) { - /* - * we don't really need to close the blist if we - * just opened it. - */ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); @@ -399,21 +428,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } - - if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_t *origin; - - err = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, - FTAG, &origin); - if (err == 0) { - ds->ds_origin_txg = - origin->ds_phys->ds_creation_txg; - dsl_dataset_rele(origin, FTAG); - } + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj, + &ds->ds_userrefs); } - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { @@ -449,13 +472,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_evict); } if (err || winner) { - bplist_close(&ds->ds_deadlist); + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); @@ -551,17 +575,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, - dsl_dataset_t **dsp) 
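Throughout this hunk the DS_MODE_* flag interface is replaced by an explicit inconsistent-ok boolean plus an owner tag. Under the new signatures introduced here, a caller that wants exclusive use of a dataset looks roughly like the following sketch (error handling trimmed):

        dsl_dataset_t *ds;
        int error;

        error = dsl_dataset_own(name, B_FALSE, FTAG, &ds);      /* B_FALSE: refuse inconsistent datasets */
        if (error == 0) {
                /* ... exclusive access to ds ... */
                dsl_dataset_disown(ds, FTAG);
        }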
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); - - ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); - + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err) return (err); - if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { - dsl_dataset_rele(*dsp, owner); + if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + dsl_dataset_rele(*dsp, tag); *dsp = NULL; return (EBUSY); } @@ -628,18 +649,14 @@ out: } int -dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) +dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(name, owner, dsp); + int err = dsl_dataset_hold(name, tag, dsp); if (err) return (err); - if ((*dsp)->ds_phys->ds_num_children > 0 && - !DS_MODE_IS_READONLY(flags)) { - dsl_dataset_rele(*dsp, owner); - return (EROFS); - } - if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { - dsl_dataset_rele(*dsp, owner); + if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); @@ -711,9 +728,9 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) } void -dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); mutex_enter(&ds->ds_lock); @@ -724,20 +741,20 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *owner) } mutex_exit(&ds->ds_lock); if (ds->ds_dbuf) - dsl_dataset_drop_ref(ds, owner); + dsl_dataset_drop_ref(ds, tag); else - dsl_dataset_evict(ds->ds_dbuf, ds); + dsl_dataset_evict(NULL, ds); } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { - ds->ds_owner = owner; + ds->ds_owner = tag; if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) rw_exit(&ds->ds_rwlock); gotit = TRUE; @@ -788,10 +805,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - if (origin) { + if (origin == NULL) { + dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); + } else { + dsl_dataset_t *ohds; + dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; @@ -807,6 +826,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_buf_will_dirty(origin->ds_dbuf, tx); origin->ds_phys->ds_num_children++; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); + dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, + dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); + dsl_dataset_rele(ohds, FTAG); + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (origin->ds_phys->ds_next_clones_obj == 0) { origin->ds_phys->ds_next_clones_obj = @@ -820,6 +845,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_origin_obj = origin->ds_object; + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + if (origin->ds_dir->dd_phys->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + origin->ds_dir->dd_phys->dd_clones = + zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY3U(0, ==, zap_add_int(mos, + origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + } } if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -852,6 +887,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dir_close(dd, FTAG); + /* + * If we are creating a clone, make sure we zero out any stale + * data from the origin snapshots zil header. + */ + if (origin != NULL) { + dsl_dataset_t *ds; + objset_t *os; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + dsl_dataset_dirty(ds, tx); + dsl_dataset_rele(ds, FTAG); + } + return (dsobj); } @@ -859,30 +909,29 @@ struct destroyarg { dsl_sync_task_group_t *dstg; char *snapname; char *failed; + boolean_t defer; }; static int -dsl_snapshot_destroy_one(char *name, void *arg) +dsl_snapshot_destroy_one(const char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; - char *cp; int err; + char *dsname; - (void) strcat(name, "@"); - (void) strcat(name, da->snapname); - err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, - da->dstg, &ds); - cp = strchr(name, '@'); - *cp = '\0'; + dsname = kmem_asprintf("%s@%s", name, da->snapname); + err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); + strfree(dsname); if (err == 0) { + struct dsl_ds_destroyarg *dsda; + dsl_dataset_make_exclusive(ds, da->dstg); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); + dsda->ds = ds; + dsda->defer = da->defer; dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); + dsl_dataset_destroy_sync, dsda, da->dstg, 0); } else if (err == ENOENT) { err = 0; } else { @@ -896,7 +945,7 @@ dsl_snapshot_destroy_one(char *name, void *arg) */ #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname) +dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) { int err; struct destroyarg da; @@ -909,6 +958,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname) 
da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); da.snapname = snapname; da.failed = fsname; + da.defer = defer; err = dmu_objset_find(fsname, dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); @@ -918,7 +968,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + struct dsl_ds_destroyarg *dsda = dst->dst_arg1; + dsl_dataset_t *ds = dsda->ds; + /* * Return the file system name that triggered the error */ @@ -926,7 +978,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) dsl_dataset_name(ds, fsname); *strchr(fsname, '@') = '\0'; } + ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, da.dstg); + kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(da.dstg); @@ -934,34 +988,94 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +static boolean_t +dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) +{ + boolean_t might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds)) + might_destroy = B_TRUE; + mutex_exit(&ds->ds_lock); + + return (might_destroy); +} + +/* + * If we're removing a clone, and these three conditions are true: + * 1) the clone's origin has no other children + * 2) the clone's origin has no user references + * 3) the clone's origin has been marked for deferred destruction + * Then, prepare to remove the origin as part of this sync task group. + */ +static int +dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *origin = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(origin)) { + char *name; + int namelen; + int error; + + namelen = dsl_dataset_namelen(origin) + 1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(origin, name); +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + kmem_free(name, namelen); + return (error); + } +#endif + error = dsl_dataset_own(name, B_TRUE, tag, &origin); + kmem_free(name, namelen); + if (error) + return (error); + dsda->rm_origin = origin; + dsl_dataset_make_exclusive(origin, tag); + } + + return (0); +} + /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. 
*/ int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; + struct dsl_ds_destroyarg dsda = { 0 }; + dsl_dataset_t dummy_ds = { 0 }; + + dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } + dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, tag, 0); + &dsda, tag, 0); + ASSERT3P(dsda.rm_origin, ==, NULL); + goto out; + } else if (defer) { + err = EINVAL; goto out; } dd = ds->ds_dir; + dummy_ds.ds_dir = dd; + dummy_ds.ds_object = ds->ds_object; /* * Check for errors and mark this ds as inconsistent, in @@ -972,7 +1086,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) if (err) goto out; - err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + err = dmu_objset_from_ds(ds, &os); if (err) goto out; @@ -988,11 +1102,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) */ (void) dmu_free_object(os, obj); } + if (err != ESRCH) + goto out; + + /* + * Only the ZIL knows how to free log blocks. + */ + zil_destroy(dmu_objset_zil(os), B_FALSE); /* - * We need to sync out all in-flight IO before we try to evict - * (the dataset evict func is trying to clear the cached entries - * for this dataset in the ARC). + * Sync out all in-flight IO. */ txg_wait_synced(dd->dd_pool, 0); @@ -1001,7 +1120,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) * context, the user space accounting should be zero. */ if (ds->ds_phys->ds_bp.blk_fill == 0 && - dmu_objset_userused_enabled(os->os)) { + dmu_objset_userused_enabled(os)) { uint64_t count; ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || @@ -1010,10 +1129,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) count == 0); } - dmu_objset_close(os); - if (err != ESRCH) - goto out; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); rw_exit(&dd->dd_pool->dp_config_rwlock); @@ -1021,30 +1136,48 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) if (err) goto out; - if (ds->ds_user_ptr) { - /* - * We need to sync out all in-flight IO before we try - * to evict (the dataset evict func is trying to clear - * the cached entries for this dataset in the ARC). - */ - txg_wait_synced(dd->dd_pool, 0); - } - /* * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); + /* + * If we're removing a clone, we might also need to remove its + * origin. 
+ */ + do { + dsda.need_prep = B_FALSE; + if (dsl_dir_is_clone(dd)) { + err = dsl_dataset_origin_rm_prep(&dsda, tag); + if (err) { + dsl_dir_close(dd, FTAG); + goto out; + } + } + + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, &dsda, tag, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + + /* + * We could be racing against 'zfs release' or 'zfs destroy -d' + * on the origin snap, in which case we can get EBUSY if we + * needed to destroy the origin snap but were not ready to + * do so. + */ + if (dsda.need_prep) { + ASSERT(err == EBUSY); + ASSERT(dsl_dir_is_clone(dd)); + ASSERT(dsda.rm_origin == NULL); + } + } while (dsda.need_prep); + + if (dsda.rm_origin != NULL) + dsl_dataset_disown(dsda.rm_origin, tag); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); @@ -1053,47 +1186,6 @@ out: return (err); } -int -dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) -{ - int err; - - ASSERT(ds->ds_owner); - - dsl_dataset_make_exclusive(ds, ds->ds_owner); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, &ost, 0); - /* drop exclusive access */ - mutex_enter(&ds->ds_lock); - rw_exit(&ds->ds_rwlock); - cv_broadcast(&ds->ds_exclusive_cv); - mutex_exit(&ds->ds_lock); - return (err); -} - -void * -dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func) -{ - void *old; - - mutex_enter(&ds->ds_lock); - old = ds->ds_user_ptr; - if (old == NULL) { - ds->ds_user_ptr = p; - ds->ds_user_evict_func = func; - } - mutex_exit(&ds->ds_lock); - return (old); -} - -void * -dsl_dataset_get_user_ptr(dsl_dataset_t *ds) -{ - return (ds->ds_user_ptr); -} - blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { @@ -1127,7 +1219,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) if (ds == NULL) /* this is the meta-objset */ return; - ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_objset != NULL); if (ds->ds_phys->ds_next_snap_obj != 0) panic("dirtying snapshot!"); @@ -1154,62 +1246,51 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; else mrs_used = 0; - VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, - &dluncomp)); + dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = ds->ds_phys->ds_used_bytes - (mrs_used - dlused); - if (!DS_UNIQUE_IS_ACCURATE(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) >= + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } -static uint64_t -dsl_dataset_unique(dsl_dataset_t *ds) -{ - if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) - dsl_dataset_recalc_head_uniq(ds); - - return (ds->ds_phys->ds_unique_bytes); -} - struct killarg { dsl_dataset_t *ds; - zio_t *zio; dmu_tx_t *tx; }; /* ARGSUSED */ static int -kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const 
zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; if (bp == NULL) return (0); - if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) || - (zb->zb_object != 0 && dnp == NULL)) { + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); /* * It's a block in the intent log. It has no * accounting, so just free it. */ - VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool, - ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT)); + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { + ASSERT(zilog == NULL); ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } return (0); @@ -1217,143 +1298,6 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, /* ARGSUSED */ static int -dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_objset_type_t *ost = arg2; - - /* - * We can only roll back to emptyness if it is a ZPL objset. - */ - if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) - return (EINVAL); - - /* - * This must not be a snapshot. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* - * If we made changes this txg, traverse_dataset won't find - * them. Try again. - */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_objset_type_t *ost = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - if (ds->ds_user_ptr != NULL) { - /* - * We need to make sure that the objset_impl_t is reopened after - * we do the rollback, otherwise it will have the wrong - * objset_phys_t. Normally this would happen when this - * dataset-open is closed, thus causing the - * dataset to be immediately evicted. But when doing "zfs recv - * -F", we reopen the objset before that, so that there is no - * window where the dataset is closed and inconsistent. - */ - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } - - /* Transfer space that was freed since last snap back to the head. */ - { - uint64_t used; - - VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, - ds->ds_origin_txg, UINT64_MAX, &used)); - dsl_dir_transfer_space(ds->ds_dir, used, - DD_USED_SNAP, DD_USED_HEAD, tx); - } - - /* Zero out the deadlist. */ - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - { - /* - * Free blkptrs that we gave birth to - this covers - * claimed but not played log blocks too. 
- */ - zio_t *zio; - struct killarg ka; - - zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED); - ka.ds = ds; - ka.zio = zio; - ka.tx = tx; - (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - (void) zio_wait(zio); - } - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); - - if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { - /* Change our contents to that of the prev snapshot */ - - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT3U(ds->ds_phys->ds_used_bytes, <=, - ds->ds_prev->ds_phys->ds_used_bytes); - - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = - ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; - } - } else { - objset_impl_t *osi; - - ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); - ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); - ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); - - bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); - ds->ds_phys->ds_flags = 0; - ds->ds_phys->ds_unique_bytes = 0; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, *ost, tx); -#ifdef _KERNEL - zfs_create_fs(&osi->os, kcred, NULL, tx); -#endif - } - - spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, - tx, cr, "dataset = %llu", ds->ds_object); -} - -/* ARGSUSED */ -static int dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; @@ -1368,7 +1312,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); + return (EBUSY); /* * This is really a dsl_dir thing, but check it here so that @@ -1386,7 +1330,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -1395,22 +1339,72 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + "dataset = %llu", ds->ds_object); +} + +static int +dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *ds_prev = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(ds_prev)) { + struct dsl_ds_destroyarg ndsda = {0}; + + /* + * If we're not prepared to remove the origin, don't remove + * the clone either. 
+ */ + if (dsda->rm_origin == NULL) { + dsda->need_prep = B_TRUE; + return (EBUSY); + } + + ndsda.ds = ds_prev; + ndsda.is_origin_rm = B_TRUE; + return (dsl_dataset_destroy_check(&ndsda, tag, tx)); + } + + /* + * If we're not going to remove the origin after all, + * undo the open context setup. + */ + if (dsda->rm_origin != NULL) { + dsl_dataset_disown(dsda->rm_origin, tag); + dsda->rm_origin = NULL; + } + + return (0); } +/* + * If you add new checks here, you may need to add + * additional checks to the "temporary" case in + * snapshot_check() in dmu_objset.c. + */ /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so noone else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - /* Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) - return (EEXIST); + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (dsda->defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + ASSERT(dsl_dataset_is_snapshot(ds)); + return (0); + } /* * Can't delete a head dataset if there are snapshots of it. @@ -1419,7 +1413,7 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); + return (EBUSY); /* * If we made changes this txg, traverse_dsl_dataset won't find @@ -1428,6 +1422,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); + if (dsl_dataset_is_snapshot(ds)) { + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0 && !dsda->releasing) + return (EBUSY); + + mutex_enter(&ds->ds_lock); + /* + * Can't delete a branch point. However, if we're destroying + * a clone and removing its origin due to it having a user + * hold count of 0 and having been marked for deferred destroy, + * it's OK for the origin to have a single clone. + */ + if (ds->ds_phys->ds_num_children > + (dsda->is_origin_rm ? 2 : 1)) { + mutex_exit(&ds->ds_lock); + return (EEXIST); + } + mutex_exit(&ds->ds_lock); + } else if (dsl_dir_is_clone(ds->ds_dir)) { + return (dsl_dataset_origin_check(dsda, arg2, tx)); + } + /* XXX we should do some i/o error checking... */ return (0); } @@ -1500,24 +1519,132 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); } +static void +dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but deadlist_remove_key() is a no-op so it + * doesn't matter. 
+ */ + if (ds->ds_dir->dd_phys->dd_clones == 0) + return; + + for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + dsl_dataset_remove_clones_key(clone, mintxg, tx); + } + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); +} + +struct process_old_arg { + dsl_dataset_t *ds; + dsl_dataset_t *ds_prev; + boolean_t after_branch_point; + zio_t *pio; + uint64_t used, comp, uncomp; +}; + +static int +process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct process_old_arg *poa = arg; + dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + + if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + if (poa->ds_prev && !poa->after_branch_point && + bp->blk_birth > + poa->ds_prev->ds_phys->ds_prev_snap_txg) { + poa->ds_prev->ds_phys->ds_unique_bytes += + bp_get_dsize_sync(dp->dp_spa, bp); + } + } else { + poa->used += bp_get_dsize_sync(dp->dp_spa, bp); + poa->comp += BP_GET_PSIZE(bp); + poa->uncomp += BP_GET_UCSIZE(bp); + dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + } + return (0); +} + +static void +process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, + dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) +{ + struct process_old_arg poa = { 0 }; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + ASSERT(ds->ds_deadlist.dl_oldfmt); + ASSERT(ds_next->ds_deadlist.dl_oldfmt); + + poa.ds = ds; + poa.ds_prev = ds_prev; + poa.after_branch_point = after_branch_point; + poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, + process_old_cb, &poa, tx)); + VERIFY3U(zio_wait(poa.pio), ==, 0); + ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -poa.used, -poa.comp, -poa.uncomp, tx); + + /* swap next's deadlist to our deadlist */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_close(&ds_next->ds_deadlist); + SWITCH64(ds_next->ds_phys->ds_deadlist_obj, + ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj); +} + void -dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - zio_t *zio; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; int err; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *ds_prev = NULL; + boolean_t wont_destroy; uint64_t obj; - ASSERT(ds->ds_owner); - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + wont_destroy = (dsda->defer && + (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); + + ASSERT(ds->ds_owner || wont_destroy); + ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + if (wont_destroy) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + 
ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; + } + /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; @@ -1526,14 +1653,21 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* Remove our reservation */ if (ds->ds_reserved != 0) { - uint64_t val = 0; - dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + dsl_prop_setarg_t psa; + uint64_t value = 0; + + dsl_prop_setarg_init_uint64(&psa, "refreservation", + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + &value); + psa.psa_effective_value = 0; /* predict default value */ + + dsl_dataset_set_reservation_sync(ds, &psa, tx); ASSERT3U(ds->ds_reserved, ==, 0); } ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - dsl_pool_ds_destroyed(ds, tx); + dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; @@ -1562,26 +1696,36 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; + + /* + * If the clone's origin has no other clones, no + * user holds, and has been marked for deferred + * deletion, then we should have done the necessary + * destroy setup for it. + */ + if (ds_prev->ds_phys->ds_num_children == 1 && + ds_prev->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds_prev)) { + ASSERT3P(dsda->rm_origin, !=, NULL); + } else { + ASSERT3P(dsda->rm_origin, ==, NULL); + } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; } } - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - - if (ds->ds_phys->ds_next_snap_obj != 0) { - blkptr_t bp; + if (dsl_dataset_is_snapshot(ds)) { dsl_dataset_t *ds_next; - uint64_t itor = 0; uint64_t old_unique; - int64_t used = 0, compressed = 0, uncompressed = 0; + uint64_t used = 0, comp = 0, uncomp = 0; VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - old_unique = dsl_dataset_unique(ds_next); + old_unique = ds_next->ds_phys->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = @@ -1591,53 +1735,49 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); - /* - * Transfer to our deadlist (which will become next's - * new deadlist) any entries from next's current - * deadlist which were born before prev, and free the - * other entries. - * - * XXX we're doing this long task with the config lock held - */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { - if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, - &bp, tx)); - if (ds_prev && !after_branch_point && - bp.blk_birth > - ds_prev->ds_phys->ds_prev_snap_txg) { - ds_prev->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } else { - used += bp_get_dasize(dp->dp_spa, &bp); - compressed += BP_GET_PSIZE(&bp); - uncompressed += BP_GET_UCSIZE(&bp); - /* XXX check return value? */ - (void) dsl_free(zio, dp, tx->tx_txg, - &bp, NULL, NULL, ARC_NOWAIT); - } - } - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. 
*/ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds_prev->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_prev_snap_txg, + &used, &comp, &uncomp); + ds_prev->ds_phys->ds_unique_bytes += used; + } - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -compressed, -uncompressed, tx); + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + ds->ds_phys->ds_deadlist_obj, tx); + } + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); - /* free next's deadlist */ - bplist_close(&ds_next->ds_deadlist); - bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); + /* Collapse range in clone heads */ + dsl_dataset_remove_clones_key(ds, + ds->ds_phys->ds_creation_txg, tx); - /* set next's deadlist to our deadlist */ - bplist_close(&ds->ds_deadlist); - ds_next->ds_phys->ds_deadlist_obj = - ds->ds_phys->ds_deadlist_obj; - VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj)); - ds->ds_phys->ds_deadlist_obj = 0; + if (dsl_dataset_is_snapshot(ds_next)) { + dsl_dataset_t *ds_nextnext; - if (ds_next->ds_phys->ds_next_snap_obj != 0) { /* * Update next's unique to include blocks which * were previously shared by only this snapshot @@ -1646,25 +1786,27 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) * died after the next snap and before the one * after that (ie. be on the snap after next's * deadlist). - * - * XXX we're doing this long task with the - * config lock held */ - dsl_dataset_t *ds_after_next; - uint64_t space; - VERIFY(0 == dsl_dataset_hold_obj(dp, ds_next->ds_phys->ds_next_snap_obj, - FTAG, &ds_after_next)); - - VERIFY(0 == - bplist_space_birthrange(&ds_after_next->ds_deadlist, + FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_creation_txg, &space)); - ds_next->ds_phys->ds_unique_bytes += space; - - dsl_dataset_rele(ds_after_next, FTAG); + ds->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + ds_next->ds_phys->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. 
*/ + dsl_dataset_t *hds; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &hds)); + dsl_deadlist_remove_key(&hds->ds_deadlist, + ds->ds_phys->ds_creation_txg, tx); + dsl_dataset_rele(hds, FTAG); + } else { ASSERT3P(ds_next->ds_prev, ==, ds); dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); @@ -1704,9 +1846,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) */ struct killarg ka; - ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; /* @@ -1717,17 +1858,32 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) * freed all the objects in open context. */ ka.ds = ds; - ka.zio = zio; ka.tx = tx; err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); + + if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY3U(0, ==, zap_remove_int(mos, + ds->ds_prev->ds_dir->dd_phys->dd_clones, + ds->ds_object, tx)); + } + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = ds_prev = NULL; + } } - err = zio_wait(zio); - ASSERT3U(err, ==, 0); + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ @@ -1762,8 +1918,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsl_dataset_rele(ds_prev, FTAG); spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, + "dataset = %llu", ds->ds_object); if (ds->ds_phys->ds_next_clones_obj != 0) { uint64_t count; @@ -1774,10 +1930,22 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); + + if (dsda->rm_origin) { + /* + * Remove the origin of the clone we just destroyed. + */ + struct dsl_ds_destroyarg ndsda = {0}; + + ndsda.ds = dsda->rm_origin; + dsl_dataset_destroy_sync(&ndsda, tag, tx); + } } static int @@ -1793,8 +1961,9 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) * owned by the snapshot dataset must be accommodated by space * outside of the reservation. 
*/ - asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); /* @@ -1807,7 +1976,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1848,7 +2016,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *snapname = arg2; @@ -1919,25 +2087,31 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * since our unique space is going to zero. */ if (ds->ds_reserved) { - int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + int64_t delta; + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, - add, 0, 0, tx); + delta, 0, 0, tx); } - bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); + zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", + ds->ds_dir->dd_myname, snapname, dsobj, + ds->ds_phys->ds_prev_snap_txg); + ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, + UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_add_key(&ds->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx); ASSERT(err == 0); @@ -1947,9 +2121,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); - dsl_pool_ds_snapshotted(ds, tx); + dsl_scan_ds_snapshotted(ds, tx); + + dsl_dir_snap_cmtime_update(ds->ds_dir); - spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, "dataset = %llu", dsobj); } @@ -1957,7 +2133,7 @@ void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_objset != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); /* @@ -1968,7 +2144,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; dsl_dir_dirty(ds->ds_dir, tx); - dmu_objset_sync(ds->ds_user_ptr, zio, tx); + dmu_objset_sync(ds->ds_objset, zio, tx); } void @@ -1992,6 +2168,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, + 
ds->ds_phys->ds_unique_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, + ds->ds_object); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, + ds->ds_userrefs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + DS_IS_DEFER_DESTROY(ds) ? 1 : 0); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -2075,8 +2259,21 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (B_TRUE); + ds->ds_prev->ds_phys->ds_creation_txg) { + objset_t *os, *os_prev; + /* + * It may be that only the ZIL differs, because it was + * reset in the head. Don't count that as being + * modified. + */ + if (dmu_objset_from_ds(ds, &os) != 0) + return (B_TRUE); + if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) + return (B_TRUE); + return (bcmp(&os->os_phys->os_meta_dnode, + &os_prev->os_phys->os_meta_dnode, + sizeof (os->os_phys->os_meta_dnode)) != 0); + } return (B_FALSE); } @@ -2113,8 +2310,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *newsnapname = arg2; @@ -2138,8 +2334,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT3U(err, ==, 0); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + "dataset = %llu", ds->ds_object); dsl_dataset_rele(hds, FTAG); } @@ -2151,43 +2347,36 @@ struct renamesnaparg { }; static int -dsl_snapshot_rename_one(char *name, void *arg) +dsl_snapshot_rename_one(const char *name, void *arg) { struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; - char *cp; + char *snapname; int err; - cp = name + strlen(name); - *cp = '@'; - (void) strcpy(cp + 1, ra->oldsnap); + snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); + (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); /* * For recursive snapshot renames the parent won't be changing * so we just pass name for both the to/from argument. */ - err = zfs_secpolicy_rename_perms(name, name, CRED()); - if (err == ENOENT) { - return (0); - } else if (err) { - (void) strcpy(ra->failed, name); - return (err); + err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); + if (err != 0) { + strfree(snapname); + return (err == ENOENT ? 0 : err); } #ifdef _KERNEL /* * For all filesystems undergoing rename, we'll need to unmount it. */ - (void) zfs_unmount_snap(name, NULL); + (void) zfs_unmount_snap(snapname, NULL); #endif - err = dsl_dataset_hold(name, ra->dstg, &ds); - *cp = '\0'; - if (err == ENOENT) { - return (0); - } else if (err) { - (void) strcpy(ra->failed, name); - return (err); - } + err = dsl_dataset_hold(snapname, ra->dstg, &ds); + strfree(snapname); + if (err != 0) + return (err == ENOENT ? 
0 : err); dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); @@ -2203,7 +2392,7 @@ dsl_recursive_rename(char *oldname, const char *newname) dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname); + int len = strlen(oldname) + 1; /* truncate the snapshot name to get the fsname */ cp = strchr(fsname, '@'); @@ -2211,7 +2400,7 @@ dsl_recursive_rename(char *oldname, const char *newname) err = spa_open(fsname, &spa, FTAG); if (err) { - kmem_free(fsname, len + 1); + kmem_free(fsname, len); return (err); } ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); @@ -2223,7 +2412,7 @@ dsl_recursive_rename(char *oldname, const char *newname) err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, DS_FIND_CHILDREN); - kmem_free(fsname, len + 1); + kmem_free(fsname, len); if (err == 0) { err = dsl_sync_task_group_wait(ra->dstg); @@ -2234,14 +2423,15 @@ dsl_recursive_rename(char *oldname, const char *newname) dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dir_name(ds->ds_dir, ra->failed); - (void) strcat(ra->failed, "@"); - (void) strcat(ra->failed, ra->newsnap); + (void) strlcat(ra->failed, "@", sizeof (ra->failed)); + (void) strlcat(ra->failed, ra->newsnap, + sizeof (ra->failed)); } dsl_dataset_rele(ds, ra->dstg); } if (err) - (void) strcpy(oldname, ra->failed); + (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); dsl_sync_task_group_destroy(ra->dstg); kmem_free(ra, sizeof (struct renamesnaparg)); @@ -2250,7 +2440,7 @@ dsl_recursive_rename(char *oldname, const char *newname) } static int -dsl_valid_rename(char *oldname, void *arg) +dsl_valid_rename(const char *oldname, void *arg) { int delta = *(int *)arg; @@ -2272,12 +2462,7 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dsl_dir_open(oldname, FTAG, &dd, &tail); if (err) return (err); - /* - * If there are more than 2 references there may be holds - * hanging around that haven't been cleared out yet. 
- */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) - txg_wait_synced(dd->dd_pool, 0); + if (tail == NULL) { int delta = strlen(newname) - strlen(oldname); @@ -2286,13 +2471,14 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dmu_objset_find(oldname, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - if (!err) + if (err == 0) err = dsl_dir_rename(dd, newname); dsl_dir_close(dd, FTAG); return (err); } + if (tail[0] != '@') { - /* the name ended in a nonexistant component */ + /* the name ended in a nonexistent component */ dsl_dir_close(dd, FTAG); return (ENOENT); } @@ -2331,13 +2517,14 @@ struct promotenode { struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin, *origin_head; + dsl_dataset_t *origin_origin; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; + char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); +static boolean_t snaplist_unstable(list_t *l); -/* ARGSUSED */ static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -2346,6 +2533,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; int err; + uint64_t unused; /* Check that it is a real clone */ if (!dsl_dir_is_clone(hds->ds_dir)) @@ -2361,10 +2549,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* compute origin's new unique space */ snap = list_tail(&pa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); - err = bplist_space_birthrange(&snap->ds->ds_deadlist, - origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); - if (err) - return (err); + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &pa->unique, &unused, &unused); /* * Walk the snapshots that we are moving @@ -2392,18 +2579,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* Check that the snapshot name does not conflict */ VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) - return (EEXIST); + if (err == 0) { + err = EEXIST; + goto out; + } if (err != ENOENT) - return (err); + goto out; /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) continue; - if (err = bplist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp)) - return (err); + dsl_deadlist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp); pa->used += dlused; pa->comp += dlcomp; pa->uncomp += dluncomp; @@ -2436,19 +2624,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * Note, typically this will not be a clone of a clone, - * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so - * these snaplist_space() -> bplist_space_birthrange() + * so dd_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. 
*/ snap = list_head(&pa->origin_snaps); err = snaplist_space(&pa->shared_snaps, - snap->ds->ds_origin_txg, &pa->cloneusedsnap); + snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); if (err) return (err); err = snaplist_space(&pa->clone_snaps, - snap->ds->ds_origin_txg, &space); + snap->ds->ds_dir->dd_origin_txg, &space); if (err) return (err); pa->cloneusedsnap += space; @@ -2461,10 +2649,13 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) } return (0); +out: + pa->err_ds = snap->ds->ds_snapname; + return (err); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; @@ -2508,10 +2699,31 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; - hds->ds_origin_txg = origin_head->ds_origin_txg; + dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); odd->dd_phys->dd_origin_obj = origin_ds->ds_object; - origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + origin_head->ds_dir->dd_origin_txg = + origin_ds->ds_phys->ds_creation_txg; + + /* change dd_clone entries */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + odd->dd_phys->dd_clones, hds->ds_object, tx)); + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + pa->origin_origin->ds_dir->dd_phys->dd_clones, + hds->ds_object, tx)); + + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + pa->origin_origin->ds_dir->dd_phys->dd_clones, + origin_head->ds_object, tx)); + if (dd->dd_phys->dd_clones == 0) { + dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + dd->dd_phys->dd_clones, origin_head->ds_object, tx)); + + } /* move snapshots to this dir */ for (snap = list_head(&pa->shared_snaps); snap; @@ -2519,9 +2731,9 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dataset_t *ds = snap->ds; /* unregister props as dsl_dir is changing */ - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } /* move snap name entry */ VERIFY(0 == dsl_dataset_get_snapname(ds)); @@ -2530,6 +2742,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); @@ -2539,6 +2752,40 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); + /* move any clone references */ + if (ds->ds_phys->ds_next_clones_obj && + spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *cnds; + uint64_t o; + + if (za.za_first_integer == oldnext_obj) { + /* + * We've already moved the + * origin's reference. 
+ */ + continue; + } + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &cnds)); + o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; + + VERIFY3U(zap_remove_int(dp->dp_meta_objset, + odd->dd_phys->dd_clones, o, tx), ==, 0); + VERIFY3U(zap_add_int(dp->dp_meta_objset, + dd->dd_phys->dd_clones, o, tx), ==, 0); + dsl_dataset_rele(cnds, FTAG); + } + zap_cursor_fini(&zc); + } + ASSERT3U(dsl_prop_numcb(ds), ==, 0); } @@ -2568,8 +2815,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) origin_ds->ds_phys->ds_unique_bytes = pa->unique; /* log history record */ - spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", hds->ds_object); + spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + "dataset = %llu", hds->ds_object); dsl_dir_close(odd, FTAG); } @@ -2634,11 +2881,9 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { - uint64_t used; - int err = bplist_space_birthrange(&snap->ds->ds_deadlist, - mintxg, UINT64_MAX, &used); - if (err) - return (err); + uint64_t used, comp, uncomp; + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); @@ -2673,7 +2918,7 @@ snaplist_destroy(list_t *l, boolean_t own) * NULL, indicating that the clone is not a clone of a clone). */ int -dsl_dataset_promote(const char *name) +dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_t *ds; dsl_dir_t *dd; @@ -2725,10 +2970,10 @@ dsl_dataset_promote(const char *name) if (err != 0) goto out; - if (dsl_dir_is_clone(snap->ds->ds_dir)) { - err = dsl_dataset_own_obj(dp, + if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { + err = dsl_dataset_hold_obj(dp, snap->ds->ds_dir->dd_phys->dd_origin_obj, - 0, FTAG, &pa.origin_origin); + FTAG, &pa.origin_origin); if (err != 0) goto out; } @@ -2744,14 +2989,16 @@ out: if (err == 0) { err = dsl_sync_task_do(dp, dsl_dataset_promote_check, dsl_dataset_promote_sync, ds, &pa, - 2 + 2 * doi.doi_physical_blks); + 2 + 2 * doi.doi_physical_blocks_512); + if (err && pa.err_ds && conflsnap) + (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); } snaplist_destroy(&pa.shared_snaps, B_TRUE); snaplist_destroy(&pa.clone_snaps, B_FALSE); snaplist_destroy(&pa.origin_snaps, B_FALSE); if (pa.origin_origin) - dsl_dataset_disown(pa.origin_origin, FTAG); + dsl_dataset_rele(pa.origin_origin, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } @@ -2778,9 +3025,11 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) if (csa->cds->ds_prev != csa->ohds->ds_prev) return (EINVAL); - /* cds should be the clone */ - if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != - csa->ohds->ds_object) + /* cds should be the clone (unless they are unrelated) */ + if (csa->cds->ds_prev != NULL && + csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && + csa->ohds->ds_object != + csa->cds->ds_prev->ds_phys->ds_next_snap_obj) return (EINVAL); /* the clone should be a child of the origin */ @@ -2803,38 +3052,49 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); + if (csa->ohds->ds_quota != 0 && + csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) + return (EDQUOT); + return (0); } /* ARGSUSED */ static void -dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, 
dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; ASSERT(csa->cds->ds_reserved == 0); - ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + ASSERT(csa->ohds->ds_quota == 0 || + csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); - dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); - if (csa->cds->ds_user_ptr != NULL) { - csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); - csa->cds->ds_user_ptr = NULL; + if (csa->cds->ds_objset != NULL) { + dmu_objset_evict(csa->cds->ds_objset); + csa->cds->ds_objset = NULL; } - if (csa->ohds->ds_user_ptr != NULL) { - csa->ohds->ds_user_evict_func(csa->ohds, - csa->ohds->ds_user_ptr); - csa->ohds->ds_user_ptr = NULL; + if (csa->ohds->ds_objset != NULL) { + dmu_objset_evict(csa->ohds->ds_objset); + csa->ohds->ds_objset = NULL; } - /* reset origin's unique bytes */ - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + /* + * Reset origin's unique bytes, if it exists. + */ + if (csa->cds->ds_prev) { + dsl_dataset_t *origin = csa->cds->ds_prev; + uint64_t comp, uncomp; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_deadlist_space_range(&csa->cds->ds_deadlist, + origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); + } /* swap blkptrs */ { @@ -2853,10 +3113,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT3U(csa->cds->ds_dir->dd_phys-> dd_used_breakdown[DD_USED_SNAP], ==, 0); - VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, - &cdl_comp, &cdl_uncomp)); - VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, - &odl_comp, &odl_uncomp)); + dsl_deadlist_space(&csa->cds->ds_deadlist, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space(&csa->ohds->ds_deadlist, + &odl_used, &odl_comp, &odl_uncomp); dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - (csa->ohds->ds_phys->ds_used_bytes + odl_used); @@ -2877,21 +3137,16 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * deadlist (since that's the only thing that's * changing that affects the snapused). */ - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); - VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + dsl_deadlist_space_range(&csa->cds->ds_deadlist, + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space_range(&csa->ohds->ds_deadlist, + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, + &odl_used, &odl_comp, &odl_uncomp); dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } -#define SWITCH64(x, y) \ - { \ - uint64_t __tmp = (x); \ - (x) = (y); \ - (y) = __tmp; \ - } - /* swap ds_*_bytes */ SWITCH64(csa->ohds->ds_phys->ds_used_bytes, csa->cds->ds_phys->ds_used_bytes); @@ -2906,22 +3161,26 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, csa->unused_refres_delta, 0, 0, tx); - /* swap deadlists */ - bplist_close(&csa->cds->ds_deadlist); - bplist_close(&csa->ohds->ds_deadlist); + /* + * Swap deadlists. 
+ */ + dsl_deadlist_close(&csa->cds->ds_deadlist); + dsl_deadlist_close(&csa->ohds->ds_deadlist); SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, csa->cds->ds_phys->ds_deadlist_obj); - VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, - csa->cds->ds_phys->ds_deadlist_obj)); - VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, - csa->ohds->ds_phys->ds_deadlist_obj)); + dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, + csa->cds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, + csa->ohds->ds_phys->ds_deadlist_obj); - dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); + dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); } /* - * Swap 'clone' with its origin head file system. Used at the end - * of "online recv" to swizzle the file system to the new version. + * Swap 'clone' with its origin head datasets. Used at the end of "zfs + * recv" into an existing fs to swizzle the file system to the new + * version, and by "zfs rollback". Can also be used to swap two + * independent head datasets if neither has any snapshots. */ int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, @@ -2933,9 +3192,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, ASSERT(clone->ds_owner); ASSERT(origin_head->ds_owner); retry: - /* Need exclusive access for the swap */ - rw_enter(&clone->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + /* + * Need exclusive access for the swap. If we're swapping these + * datasets back after an error, we already hold the locks. + */ + if (!RW_WRITE_HELD(&clone->ds_rwlock)) + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && + !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { rw_exit(&clone->ds_rwlock); rw_enter(&origin_head->ds_rwlock, RW_WRITER); if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { @@ -3030,62 +3294,70 @@ static int dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_prop_setarg_t *psa = arg2; + int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) return (ENOTSUP); - if (new_quota == 0) + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + if (psa->psa_effective_value == 0) return (0); - if (new_quota < ds->ds_phys->ds_used_bytes || - new_quota < ds->ds_reserved) + if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); return (0); } -/* ARGSUSED */ +extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); + void -dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; - ds->ds_quota = new_quota; + dsl_prop_set_sync(ds, psa, tx); + DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); - dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + if (ds->ds_quota != effective_value) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = effective_value; - spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", - (longlong_t)new_quota, ds->ds_object); + 
spa_history_log_internal(LOG_DS_REFQUOTA, + ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", + (longlong_t)ds->ds_quota, ds->ds_object); + } } int -dsl_dataset_set_quota(const char *dsname, uint64_t quota) +dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) { dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; + dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - if (quota != ds->ds_quota) { - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(ds->ds_dir->dd_pool, 0); + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. + */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, &psa, 0); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, - ds, &quota, 0); - } dsl_dataset_rele(ds, FTAG); return (err); } @@ -3094,9 +3366,10 @@ static int dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value; uint64_t unique; + int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) @@ -3105,6 +3378,11 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dataset_is_snapshot(ds)) return (EINVAL); + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + effective_value = psa->psa_effective_value; + /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate.
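For orientation: the refquota and refreservation setters above follow the same two-phase DSL sync-task pattern, with a check callback that runs as a preliminary pass in open context (hence the caveat about inaccurate space estimates) and again in syncing context, and a sync callback that persists the property and updates the in-core value. A condensed sketch of that pattern, with hypothetical example_* names standing in for the real check/sync pairs:

static int
example_prop_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_prop_setarg_t *psa = arg2;
	int err;

	/* Compute psa_effective_value for the requested source/value. */
	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
		return (err);
	/* Validate only; no state may be modified in the check. */
	if (psa->psa_effective_value != 0 &&
	    psa->psa_effective_value < ds->ds_phys->ds_used_bytes)
		return (ENOSPC);
	return (0);
}

static void
example_prop_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_prop_setarg_t *psa = arg2;

	/* Persist the property, then apply the effective value in-core. */
	dsl_prop_set_sync(ds, psa, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_quota = psa->psa_effective_value;
}

Such a pair is handed to dsl_sync_task_do(dp, example_prop_check, example_prop_sync, ds, &psa, 0), exactly as dsl_dataset_set_quota() does above.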
@@ -3113,67 +3391,645 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = ds->ds_phys->ds_unique_bytes; mutex_exit(&ds->ds_lock); - if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) { - uint64_t delta = MAX(unique, new_reservation) - + if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, effective_value) - MAX(unique, ds->ds_reserved); if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); if (ds->ds_quota > 0 && - new_reservation > ds->ds_quota) + effective_value > ds->ds_quota) return (ENOSPC); } return (0); } -/* ARGSUSED */ static void -dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx) +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; uint64_t unique; int64_t delta; + dsl_prop_set_sync(ds, psa, tx); + DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); + dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); - delta = MAX(0, (int64_t)(new_reservation - unique)) - + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = ds->ds_phys->ds_unique_bytes; + delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = new_reservation; + ds->ds_reserved = effective_value; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", - new_reservation, cr, tx); - spa_history_internal_log(LOG_DS_REFRESERV, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", - (longlong_t)new_reservation, ds->ds_object); + spa_history_log_internal(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", + (longlong_t)effective_value, ds->ds_object); } int -dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation) { dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; + dsl_prop_setarg_init_uint64(&psa, "refreservation", source, + &reservation); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_reservation_check, - dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_set_reservation_sync, ds, &psa, 0); + dsl_dataset_rele(ds, FTAG); return (err); } + +typedef struct zfs_hold_cleanup_arg { + dsl_pool_t *dp; + uint64_t dsobj; + char htag[MAXNAMELEN]; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + + (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, + B_TRUE); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); +} + +void +dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); + ca->dp = ds->ds_dir->dd_pool; + ca->dsobj = ds->ds_object; + (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); + VERIFY3U(0, ==, zfs_onexit_add_cb(minor, + 
dsl_dataset_user_release_onexit, ca, NULL)); +} + +/* + * If you add new checks here, you may need to add + * additional checks to the "temporary" case in + * snapshot_check() in dmu_objset.c. + */ +static int +dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct dsl_ds_holdarg *ha = arg2; + char *htag = ha->htag; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error = 0; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* tags must be unique */ + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj) { + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, + 8, 1, tx); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + + if (error == 0 && ha->temphold && + strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + error = E2BIG; + + return (error); +} + +void +dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct dsl_ds_holdarg *ha = arg2; + char *htag = ha->htag; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t now = gethrestime_sec(); + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (ha->temphold) { + VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, + htag, &now, tx)); + } + + spa_history_log_internal(LOG_DS_USER_HOLD, + dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, + (int)ha->temphold, ds->ds_object); +} + +static int +dsl_dataset_user_hold_one(const char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + dsl_dataset_t *ds; + int error; + char *name; + + /* alloc a buffer to hold dsname@snapname plus terminating NULL */ + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, ha->dstg, &ds); + strfree(name); + if (error == 0) { + ha->gotone = B_TRUE; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, ds, ha, 0); + } else if (error == ENOENT && ha->recursive) { + error = 0; + } else { + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + } + return (error); +} + +int +dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, + boolean_t temphold) +{ + struct dsl_ds_holdarg *ha; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + ha->htag = htag; + ha->temphold = temphold; + error = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, + ds, ha, 0); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + + return (error); +} + +int +dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive, boolean_t temphold, int cleanup_fd) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + minor_t minor = 0; + + if (cleanup_fd != -1) { + /* Currently we only support cleanup-on-exit of tempholds. 
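 * (Aside, for context: zfs_onexit_fd_hold() below resolves cleanup_fd,
 * a descriptor on the ZFS control device supplied by the caller, to the
 * minor of that open instance.  After the holds have been created, the
 * loop further down calls dsl_register_onexit_hold_cleanup(), which
 * queues dsl_dataset_user_release_onexit() on that minor, so a
 * temporary hold taken for, e.g., an in-progress "zfs send" is dropped
 * automatically if the process exits without releasing it.)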
*/ + if (!temphold) + return (EINVAL); + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error) + return (error); + } + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + if (cleanup_fd != -1) + zfs_onexit_fd_rele(cleanup_fd); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + ha->temphold = temphold; + + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_hold_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + + if (dst->dst_err) { + dsl_dataset_name(ds, ha->failed); + *strchr(ha->failed, '@') = '\0'; + } else if (error == 0 && minor != 0 && temphold) { + /* + * If this hold is to be released upon process exit, + * register that action now. + */ + dsl_register_onexit_hold_cleanup(ds, htag, minor); + } + dsl_dataset_rele(ds, ha->dstg); + } + + if (error == 0 && recursive && !ha->gotone) + error = ENOENT; + + if (error) + (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); + + dsl_sync_task_group_destroy(ha->dstg); + + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + if (cleanup_fd != -1) + zfs_onexit_fd_rele(cleanup_fd); + return (error); +} + +struct dsl_ds_releasearg { + dsl_dataset_t *ds; + const char *htag; + boolean_t own; /* do we own or just hold ds? */ +}; + +static int +dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, + boolean_t *might_destroy) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t tmp; + int error; + + *might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) { + /* The tag can't possibly exist */ + mutex_exit(&ds->ds_lock); + return (ESRCH); + } + + /* Make sure the tag exists */ + error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); + if (error) { + mutex_exit(&ds->ds_lock); + if (error == ENOENT) + error = ESRCH; + return (error); + } + + if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) + *might_destroy = B_TRUE; + + mutex_exit(&ds->ds_lock); + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + boolean_t might_destroy; + int error; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); + if (error) + return (error); + + if (might_destroy) { + struct dsl_ds_destroyarg dsda = {0}; + + if (dmu_tx_is_syncing(tx)) { + /* + * If we're not prepared to remove the snapshot, + * we can't allow the release to happen right now. 
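 * Releasing the last user hold on a snapshot that was marked for
 * deferred destroy ("zfs destroy -d") is what finally removes it:
 * dsl_dataset_release_might_destroy() reports might_destroy when
 * ds_userrefs == 1, DS_IS_DEFER_DESTROY is set and there are no
 * clones, and dsl_dataset_user_release_sync() then runs
 * dsl_dataset_destroy_sync() in the same transaction.  That is why
 * the open-context caller, dsl_dataset_user_release_one(), must
 * already have unmounted the snapshot and taken ownership (ra->own).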
+ */ + if (!ra->own) + return (EBUSY); + } + dsda.ds = ds; + dsda.releasing = B_TRUE; + return (dsl_dataset_destroy_check(&dsda, tag, tx)); + } + + return (0); +} + +static void +dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + uint64_t dsobj = ds->ds_object; + uint64_t refs; + int error; + + mutex_enter(&ds->ds_lock); + ds->ds_userrefs--; + refs = ds->ds_userrefs; + mutex_exit(&ds->ds_lock); + error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); + VERIFY(error == 0 || error == ENOENT); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); + if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) { + struct dsl_ds_destroyarg dsda = {0}; + + ASSERT(ra->own); + dsda.ds = ds; + dsda.releasing = B_TRUE; + /* We already did the destroy_check */ + dsl_dataset_destroy_sync(&dsda, tag, tx); + } + + spa_history_log_internal(LOG_DS_USER_RELEASE, + dp->dp_spa, tx, "<%s> %lld dataset = %llu", + ra->htag, (longlong_t)refs, dsobj); +} + +static int +dsl_dataset_user_release_one(const char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + struct dsl_ds_releasearg *ra; + dsl_dataset_t *ds; + int error; + void *dtag = ha->dstg; + char *name; + boolean_t own = B_FALSE; + boolean_t might_destroy; + + /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, dtag, &ds); + strfree(name); + if (error == ENOENT && ha->recursive) + return (0); + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + if (error) + return (error); + + ha->gotone = B_TRUE; + + ASSERT(dsl_dataset_is_snapshot(ds)); + + error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + + if (might_destroy) { +#ifdef _KERNEL + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = zfs_unmount_snap(name, NULL); + strfree(name); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } +#endif + if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { + dsl_dataset_rele(ds, dtag); + return (EBUSY); + } else { + own = B_TRUE; + dsl_dataset_make_exclusive(ds, dtag); + } + } + + ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); + ra->ds = ds; + ra->htag = ha->htag; + ra->own = own; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, ra, dtag, 0); + + return (0); +} + +int +dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + +top: + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_release_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_release_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + 
dst = list_next(&ha->dstg->dstg_tasks, dst)) { + struct dsl_ds_releasearg *ra = dst->dst_arg1; + dsl_dataset_t *ds = ra->ds; + + if (dst->dst_err) + dsl_dataset_name(ds, ha->failed); + + if (ra->own) + dsl_dataset_disown(ds, ha->dstg); + else + dsl_dataset_rele(ds, ha->dstg); + + kmem_free(ra, sizeof (struct dsl_ds_releasearg)); + } + + if (error == 0 && recursive && !ha->gotone) + error = ENOENT; + + if (error && error != EBUSY) + (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + + /* + * We can get EBUSY if we were racing with deferred destroy and + * dsl_dataset_user_release_check() hadn't done the necessary + * open context setup. We can also get EBUSY if we're racing + * with destroy and that thread is the ds_owner. Either way + * the busy condition should be transient, and we should retry + * the release operation. + */ + if (error == EBUSY) + goto top; + + return (error); +} + +/* + * Called at spa_load time (with retry == B_FALSE) to release a stale + * temporary user hold. Also called by the onexit code (with retry == B_TRUE). + */ +int +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, + boolean_t retry) +{ + dsl_dataset_t *ds; + char *snap; + char *name; + int namelen; + int error; + + do { + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) + return (error); + namelen = dsl_dataset_namelen(ds)+1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + + snap = strchr(name, '@'); + *snap = '\0'; + ++snap; + error = dsl_dataset_user_release(name, snap, htag, B_FALSE); + kmem_free(name, namelen); + + /* + * The object can't have been destroyed because we have a hold, + * but it might have been renamed, resulting in ENOENT. Retry + * if we've been requested to do so. + * + * It would be nice if we could use the dsobj all the way + * through and avoid ENOENT entirely. But we might need to + * unmount the snapshot, and there's currently no way to lookup + * a vfsp using a ZFS object id. + */ + } while ((error == ENOENT) && retry); + + return (error); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, + za->za_first_integer)); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Note, this function is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them.
+ */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + dsl_dataset_t *ds; + + if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { + if (DS_IS_INCONSISTENT(ds)) + (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); + else + dsl_dataset_disown(ds, FTAG); + } + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c new file mode 100644 index 0000000..064f8ac --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c @@ -0,0 +1,474 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +static int +dsl_deadlist_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_entry_t *dle1 = arg1; + const dsl_deadlist_entry_t *dle2 = arg2; + + if (dle1->dle_mintxg < dle2->dle_mintxg) + return (-1); + else if (dle1->dle_mintxg > dle2->dle_mintxg) + return (+1); + else + return (0); +} + +static void +dsl_deadlist_load_tree(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havetree) + return; + + avl_create(&dl->dl_tree, dsl_deadlist_compare, + sizeof (dsl_deadlist_entry_t), + offsetof(dsl_deadlist_entry_t, dle_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, + za.za_first_integer)); + avl_add(&dl->dl_tree, dle); + } + zap_cursor_fini(&zc); + dl->dl_havetree = B_TRUE; +} + +void +dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); + dl->dl_os = os; + dl->dl_object = object; + VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + dmu_object_info_from_db(dl->dl_dbuf, &doi); + if (doi.doi_type == DMU_OT_BPOBJ) { + dmu_buf_rele(dl->dl_dbuf, dl); + dl->dl_dbuf = NULL; + dl->dl_oldfmt = B_TRUE; + VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); + return; + } + + dl->dl_oldfmt = B_FALSE; + dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_havetree = B_FALSE; +} + +void +dsl_deadlist_close(dsl_deadlist_t *dl) +{ + void *cookie = NULL; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) { + dl->dl_oldfmt = B_FALSE; + bpobj_close(&dl->dl_bpobj); + return; + } + + if (dl->dl_havetree) { + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) + != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + 
} + avl_destroy(&dl->dl_tree); + } + dmu_buf_rele(dl->dl_dbuf, dl); + mutex_destroy(&dl->dl_lock); + dl->dl_dbuf = NULL; + dl->dl_phys = NULL; +} + +uint64_t +dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) +{ + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); + return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, + sizeof (dsl_deadlist_phys_t), tx)); +} + +void +dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) +{ + dmu_object_info_t doi; + zap_cursor_t zc; + zap_attribute_t za; + + VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_free(os, dlobj, tx); + return; + } + + for (zap_cursor_init(&zc, os, dlobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) + bpobj_free(os, za.za_first_integer, tx); + zap_cursor_fini(&zc); + VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); +} + +void +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + bpobj_enqueue(&dl->dl_bpobj, bp, tx); + return; + } + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + mutex_enter(&dl->dl_lock); + dl->dl_phys->dl_used += + bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + mutex_exit(&dl->dl_lock); + + dle_tofind.dle_mintxg = bp->blk_birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + else + dle = AVL_PREV(&dl->dl_tree, dle); + bpobj_enqueue(&dle->dle_bpobj, bp, tx); +} + +/* + * Insert new key in deadlist, which must be > all current entries. + * mintxg is not inclusive. + */ +void +dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t obj; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) + return; + + dsl_deadlist_load_tree(dl); + + dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = mintxg; + obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + avl_add(&dl->dl_tree, dle); + + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, + mintxg, obj, tx)); +} + +/* + * Remove this key, merging its entries into the previous key. + */ +void +dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle, *dle_prev; + + if (dl->dl_oldfmt) + return; + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + dle_prev = AVL_PREV(&dl->dl_tree, dle); + + bpobj_enqueue_subobj(&dle_prev->dle_bpobj, + dle->dle_bpobj.bpo_object, tx); + + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); +} + +/* + * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */ +static void +dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_t dl; + dsl_pool_t *dp = dmu_objset_pool(os); + + dsl_deadlist_open(&dl, os, dlobj); + if (dl.dl_oldfmt) { + dsl_deadlist_close(&dl); + return; + } + + while (mrs_obj != 0) { + dsl_dataset_t *ds; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); + dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); + mrs_obj = ds->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + dsl_deadlist_close(&dl); +} + +uint64_t +dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t *dle; + uint64_t newobj; + + newobj = dsl_deadlist_alloc(dl->dl_os, tx); + + if (dl->dl_oldfmt) { + dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); + return (newobj); + } + + dsl_deadlist_load_tree(dl); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t obj; + + if (dle->dle_mintxg >= maxtxg) + break; + + obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, + dle->dle_mintxg, obj, tx)); + } + return (newobj); +} + +void +dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, + usedp, compp, uncompp)); + return; + } + + mutex_enter(&dl->dl_lock); + *usedp = dl->dl_phys->dl_used; + *compp = dl->dl_phys->dl_comp; + *uncompp = dl->dl_phys->dl_uncomp; + mutex_exit(&dl->dl_lock); +} + +/* + * return space used in the range (mintxg, maxtxg]. + * Includes maxtxg, does not include mintxg. + * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is + * UINT64_MAX). + */ +void +dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, + mintxg, maxtxg, usedp, compp, uncompp)); + return; + } + + dsl_deadlist_load_tree(dl); + *usedp = *compp = *uncompp = 0; + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + /* + * If we don't find this mintxg, there shouldn't be anything + * after it either. 
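 * As a worked illustration (keys hypothetical): with entry keys
 * {10, 55, 200}, dsl_deadlist_insert() put a block whose blk_birth
 * is 57 on the key-55 bpobj, the largest key below its birth txg,
 * so dsl_deadlist_space_range(dl, 55, UINT64_MAX, ...) walks the
 * key-55 and key-200 entries and counts that block, matching the
 * (mintxg, maxtxg] contract described above.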
+ */ + ASSERT(dle != NULL || + avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + for (; dle && dle->dle_mintxg < maxtxg; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t used, comp, uncomp; + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + + *usedp += used; + *compp += comp; + *uncompp += uncomp; + } +} + +static void +dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + uint64_t used, comp, uncomp; + bpobj_t bpo; + + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); + bpobj_close(&bpo); + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + mutex_enter(&dl->dl_lock); + dl->dl_phys->dl_used += used; + dl->dl_phys->dl_comp += comp; + dl->dl_phys->dl_uncomp += uncomp; + mutex_exit(&dl->dl_lock); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); +} + +static int +dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + +/* + * Merge the deadlist pointed to by 'obj' into dl. obj will be left as + * an empty deadlist. + */ +void +dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + dmu_buf_t *bonus; + dsl_deadlist_phys_t *dlp; + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_t bpo; + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_iterate(&bpo, + dsl_deadlist_insert_cb, dl, tx)); + bpobj_close(&bpo); + return; + } + + for (zap_cursor_init(&zc, dl->dl_os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t mintxg = strtonum(za.za_name, NULL); + dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); + } + zap_cursor_fini(&zc); + + VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); + dlp = bonus->db_data; + dmu_buf_will_dirty(bonus, tx); + bzero(dlp, sizeof (*dlp)); + dmu_buf_rele(bonus, FTAG); +} + +/* + * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
+ */ +void +dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(!dl->dl_oldfmt); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + while (dle) { + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t *dle_next; + + bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + mutex_enter(&dl->dl_lock); + ASSERT3U(dl->dl_phys->dl_used, >=, used); + ASSERT3U(dl->dl_phys->dl_comp, >=, comp); + ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + mutex_exit(&dl->dl_lock); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, + dle->dle_mintxg, tx)); + + dle_next = AVL_NEXT(&dl->dl_tree, dle); + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + dle = dle_next; + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c index 7ff8430..b85c373 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -75,8 +74,6 @@ #include #include #include -#include -#include /* for the default checksum value */ #include #include #include @@ -150,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) } static void -dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; nvlist_t *nvp = arg2; @@ -185,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(zap_update(mos, jumpobj, perm, 8, 1, &n, tx) == 0); - spa_history_internal_log(LOG_DS_PERM_UPDATE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, "%s %s dataset = %llu", whokey, perm, dd->dd_phys->dd_head_dataset_obj); } @@ -194,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } static void -dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; nvlist_t *nvp = arg2; @@ -217,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) (void) zap_remove(mos, zapobj, whokey, tx); VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, "%s dataset = %llu", whokey, dd->dd_phys->dd_head_dataset_obj); continue; @@ -238,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_REMOVE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, "%s %s dataset = %llu", whokey, perm, 
dd->dd_phys->dd_head_dataset_obj); } @@ -531,9 +528,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, * Check if user has requested permission. */ int -dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) { - dsl_dataset_t *ds; dsl_dir_t *dd; dsl_pool_t *dp; void *cookie; @@ -543,23 +539,15 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) avl_tree_t permsets; perm_set_t *setnode; - error = dsl_dataset_hold(dsname, FTAG, &ds); - if (error) - return (error); - dp = ds->ds_dir->dd_pool; mos = dp->dp_meta_objset; - if (dsl_delegation_on(mos) == B_FALSE) { - dsl_dataset_rele(ds, FTAG); + if (dsl_delegation_on(mos) == B_FALSE) return (ECANCELED); - } if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) { - dsl_dataset_rele(ds, FTAG); + SPA_VERSION_DELEGATED_PERMS) return (EPERM); - } if (dsl_dataset_is_snapshot(ds)) { /* @@ -589,7 +577,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) if (dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_ZONED), - 8, 1, &zoned, NULL) != 0) + 8, 1, &zoned, NULL, B_FALSE) != 0) break; if (!zoned) break; @@ -636,7 +624,6 @@ again: error = EPERM; success: rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) @@ -645,6 +632,22 @@ success: return (error); } +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dsname, FTAG, &ds); + if (error) + return (error); + + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + + return (error); +} + /* * Other routines. */ @@ -739,5 +742,5 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) boolean_t dsl_delegation_on(objset_t *os) { - return (os->os->os_spa->spa_delegation); + return (!!spa_delegation(os->os_spa)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 2f312ae..1cd49c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -32,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -39,8 +39,7 @@ #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); /* ARGSUSED */ @@ -63,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) spa_close(dd->dd_pool->dp_spa, dd); /* - * The props callback list should be empty since they hold the - * dir open. + * The props callback list should have been cleaned up by + * objset_evict(). 
*/ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); @@ -107,6 +106,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); + dsl_dir_snap_cmtime_update(dd); + if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); @@ -133,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } + if (dsl_dir_is_clone(dd)) { + dmu_buf_t *origin_bonus; + dsl_dataset_phys_t *origin_phys; + + /* + * We can't open the origin dataset, because + * that would require opening this dsl_dir. + * Just look at its phys directly instead. + */ + err = dmu_bonus_hold(dp->dp_meta_objset, + dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); + if (err) + goto errout; + origin_phys = origin_bonus->db_data; + dd->dd_origin_txg = + origin_phys->ds_creation_txg; + dmu_buf_rele(origin_bonus, FTAG); + } + winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, dsl_dir_evict); if (winner) { @@ -392,7 +412,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; - dsl_dir_phys_t *dsphys; + dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, @@ -407,17 +427,17 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + ddphys = dbuf->db_data; - dsphys->dd_creation_time = gethrestime_sec(); + ddphys->dd_creation_time = gethrestime_sec(); if (pds) - dsphys->dd_parent_obj = pds->dd_object; - dsphys->dd_props_zapobj = zap_create(mos, + ddphys->dd_parent_obj = pds->dd_object; + ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsphys->dd_child_dir_zapobj = zap_create(mos, + ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) - dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); @@ -427,7 +447,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, int dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err; @@ -454,19 +475,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val, obj; + dsl_prop_setarg_t psa; + uint64_t value = 0; + uint64_t obj; dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* Remove our reservation. 
*/ - val = 0; - dsl_dir_set_reservation_sync(dd, &val, cr, tx); + dsl_prop_setarg_init_uint64(&psa, "reservation", + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + &value); + psa.psa_effective_value = 0; /* predict default value */ + + dsl_dir_set_reservation_sync(ds, &psa, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); for (t = 0; t < DD_USED_NUM; t++) @@ -640,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd, if (used > quota) { /* over quota */ myspace = 0; - - /* - * While it's OK to be a little over quota, if - * we think we are using more space than there - * is in the pool (which is already 1.6% more than - * dsl_pool_adjustedsize()), something is very - * wrong. - */ - ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa)); } else { /* * the lesser of the space provided by our parent and @@ -676,8 +696,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + uint64_t deferred = 0; struct tempreserve *tr; - int enospc = EDQUOT; + int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; @@ -703,7 +724,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, */ if (first && tx->tx_objset) { int error; - dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); @@ -723,7 +744,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, quota = dd->dd_phys->dd_quota; /* - * Adjust the quota against the actual pool size at the root. + * Adjust the quota against the actual pool size at the root + * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, @@ -732,10 +754,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * removes to get through. */ if (dd->dd_parent == NULL) { + spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); - if (poolsize < quota) { - quota = poolsize; - enospc = ENOSPC; + deferred = metaslab_class_get_deferred(spa_normal_class(spa)); + if (poolsize - deferred < quota) { + quota = poolsize - deferred; + retval = ENOSPC; } } @@ -745,15 +769,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * on-disk is over quota and there are no pending changes (which * may free up space for us). 
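 * For illustration (numbers hypothetical): with a 100G adjusted pool
 * size and 2G of outstanding deferred frees, the root quota is clamped
 * to 98G and retval becomes ENOSPC rather than EDQUOT.  If on-disk
 * usage sits between 98G and 100G the caller gets ERESTART and retries
 * once the deferred frees have been committed; only when on-disk usage
 * alone reaches quota plus the deferred amount (and nothing is in
 * flight) does the reservation fail hard with ENOSPC.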
*/ - if (used_on_disk + est_inflight > quota) { - if (est_inflight > 0 || used_on_disk < quota) - enospc = ERESTART; + if (used_on_disk + est_inflight >= quota) { + if (est_inflight > 0 || used_on_disk < quota || + (retval == ENOSPC && used_on_disk < quota + deferred)) + retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, - quota>>10, asize>>10, enospc); + quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); - return (enospc); + return (retval); } /* We need to up our estimated delta before dropping dd_lock */ @@ -987,13 +1012,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; - int err = 0; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + int err; uint64_t towrite; - if (new_quota == 0) + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + if (psa->psa_effective_value == 0) return (0); mutex_enter(&dd->dd_lock); @@ -1005,64 +1033,88 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) */ towrite = dsl_dir_space_towrite(dd); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (new_quota < dd->dd_phys->dd_reserved || - new_quota < dd->dd_phys->dd_used_bytes + towrite)) { + (psa->psa_effective_value < dd->dd_phys->dd_reserved || + psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { err = ENOSPC; } mutex_exit(&dd->dd_lock); return (err); } -/* ARGSUSED */ +extern dsl_syncfunc_t dsl_prop_set_sync; + static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; + + dsl_prop_set_sync(ds, psa, tx); + DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = new_quota; + dd->dd_phys->dd_quota = effective_value; mutex_exit(&dd->dd_lock); - spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", - (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa, + tx, "%lld dataset = %llu ", + (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } int -dsl_dir_set_quota(const char *ddname, uint64_t quota) +dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota); + + err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); - if (quota != dd->dd_phys->dd_quota) { - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect.
- */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, dd, &quota, 0); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); } + + ASSERT(ds->ds_dir == dd); + + /* + * If someone removes a file, then tries to set the quota, we want to + * make sure the file freeing takes effect. + */ + txg_wait_open(dd->dd_pool, 0); + + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, ds, &psa, 0); + dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value; uint64_t used, avail; + int err; + + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + effective_value = psa->psa_effective_value; /* * If we are doing the preliminary check in open context, the @@ -1082,37 +1134,40 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) { - uint64_t delta = MAX(used, new_reservation) - + if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, effective_value) - MAX(used, dd->dd_phys->dd_reserved); if (delta > avail) return (ENOSPC); if (dd->dd_phys->dd_quota > 0 && - new_reservation > dd->dd_phys->dd_quota) + effective_value > dd->dd_phys->dd_quota) return (ENOSPC); } return (0); } -/* ARGSUSED */ static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; uint64_t used; int64_t delta; + dsl_prop_set_sync(ds, psa, tx); + DSL_PROP_CHECK_PREDICTION(dd, psa); + dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - + delta = MAX(used, effective_value) - MAX(used, dd->dd_phys->dd_reserved); - dd->dd_phys->dd_reserved = new_reservation; + dd->dd_phys->dd_reserved = effective_value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ @@ -1121,23 +1176,39 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } mutex_exit(&dd->dd_lock); - spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu", - (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, + tx, "%lld dataset = %llu", + (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } int -dsl_dir_set_reservation(const char *ddname, uint64_t reservation) +dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation) { dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); + + err = dsl_dataset_hold(ddname, FTAG, &ds); if (err)
return (err); + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + ASSERT(ds->ds_dir == dd); + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, dd, &reservation, 0); + dsl_dir_set_reservation_sync, ds, &psa, 0); + dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } @@ -1175,7 +1246,6 @@ struct renamearg { const char *mynewname; }; -/*ARGSUSED*/ static int dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1186,8 +1256,14 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) int err; uint64_t val; - /* There should be 2 references: the open and the dirty */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) + /* + * There should only be one reference, from dmu_objset_rename(). + * Fleeting holds are also possible (eg, from "zfs list" getting + * stats), but any that are present in open context will likely + * be gone by syncing context, so only fail from syncing + * context. + */ + if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) return (EBUSY); /* check for existing name */ @@ -1216,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; @@ -1265,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dd->dd_myname, 8, 1, &dd->dd_object, tx); ASSERT3U(err, ==, 0); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, + tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); } int @@ -1315,3 +1391,26 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) return (0); } + +timestruc_t +dsl_dir_snap_cmtime(dsl_dir_t *dd) +{ + timestruc_t t; + + mutex_enter(&dd->dd_lock); + t = dd->dd_snap_cmtime; + mutex_exit(&dd->dd_lock); + + return (t); +} + +void +dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +{ + timestruc_t t; + + gethrestime(&t); + mutex_enter(&dd->dd_lock); + dd->dd_snap_cmtime = t; + mutex_exit(&dd->dd_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index 0f00bc96..ea5e60d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -19,14 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include #include +#include #include #include +#include +#include #include #include #include @@ -36,22 +38,47 @@ #include #include #include +#include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ -int zfs_txg_synctime = 5; /* target secs to sync a txg */ +int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ uint64_t zfs_write_limit_inflated = 0; uint64_t zfs_write_limit_override = 0; -extern uint64_t zfs_write_limit_min; kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; -static int +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle); +SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN, + &zfs_no_write_throttle, 0, ""); +TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift); +SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN, + &zfs_write_limit_shift, 0, "2^N of physical memory"); +SYSCTL_DECL(_vfs_zfs_txg); +TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN, + &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg"); + +TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN, + &zfs_write_limit_min, 0, "Minimum write limit"); +TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN, + &zfs_write_limit_max, 0, "Maximum data payload per txg"); +TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN, + &zfs_write_limit_inflated, 0, ""); +TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN, + &zfs_write_limit_override, 0, ""); + +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; @@ -89,7 +116,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dataset_t, ds_synced_link)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 1, 4, 0); @@ -104,13 +130,13 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dsl_dir_t *dd; dsl_dataset_t *ds; - objset_impl_t *osi; + uint64_t obj; rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); if (err) goto out; - dp->dp_meta_objset = &osi->os; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, @@ -135,8 +161,8 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, dp, - &dp->dp_origin_snap); + ds->ds_phys->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } dsl_dir_close(dd, dp); @@ -144,53 +170,30 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; } - /* get scrub status */ - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - 
&dp->dp_scrub_func); - if (err == 0) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, - &dp->dp_scrub_bookmark); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, + &dp->dp_free_dir); if (err) goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors); + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - /* - * A new-type scrub was in progress on an old - * pool. Restart from the beginning, since the - * old software may have changed the pool in the - * meantime. - */ - dsl_pool_scrub_restart(dp); - } - } else { - /* - * It's OK if there is no scrub in progress (and if - * there was an I/O error, ignore it). - */ - err = 0; + VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, + &dp->dp_tmp_userrefs_obj); + if (err == ENOENT) + err = 0; + if (err) + goto out; + + err = dsl_scan_init(dp, txg); + out: rw_exit(&dp->dp_config_rwlock); if (err) @@ -215,23 +218,27 @@ dsl_pool_close(dsl_pool_t *dp) dsl_dataset_drop_ref(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) dsl_dir_close(dp->dp_mos_dir, dp); + if (dp->dp_free_dir) + dsl_dir_close(dp->dp_free_dir, dp); if (dp->dp_root_dir) dsl_dir_close(dp->dp_root_dir, dp); + bpobj_close(&dp->dp_free_bpobj); + /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset) - dmu_objset_evict(NULL, dp->dp_meta_objset->os); + dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); - txg_list_destroy(&dp->dp_dirty_dirs); txg_list_destroy(&dp->dp_sync_tasks); + txg_list_destroy(&dp->dp_dirty_dirs); list_destroy(&dp->dp_synced_datasets); arc_flush(dp->dp_spa); txg_fini(dp); + dsl_scan_fini(dp); rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); - mutex_destroy(&dp->dp_scrub_cancel_lock); taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); @@ -244,19 +251,22 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); - objset_impl_t *osip; + objset_t *os; dsl_dataset_t *ds; - uint64_t dsobj; + uint64_t obj; /* create and open the MOS (meta-objset) */ - dp->dp_meta_objset = &dmu_objset_create_impl(spa, - NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; + dp->dp_meta_objset = dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT3U(err, ==, 0); + /* Initialize scan structures */ + VERIFY3U(0, ==, dsl_scan_init(dp, txg)); + /* create and open the root dir */ 
dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, @@ -267,18 +277,33 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) VERIFY(0 == dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + /* create and open the free dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + FREE_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* create and open the free_bplist */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); + VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); + VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* create the root dataset */ - dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); /* create the root objset */ - VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - osip = dmu_objset_create_impl(dp->dp_spa, ds, + VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); #ifdef _KERNEL - zfs_create_fs(&osip->os, kcred, zplprops, tx); + zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele(ds, FTAG); @@ -287,6 +312,14 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) return (dp); } +static int +deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { @@ -295,11 +328,19 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_t *dd; dsl_dataset_t *ds; dsl_sync_task_group_t *dstg; - objset_impl_t *mosi = dp->dp_meta_objset->os; + objset_t *mos = dp->dp_meta_objset; hrtime_t start, write_time; uint64_t data_written; int err; + /* + * We need to copy dp_space_towrite() before doing + * dsl_sync_task_group_sync(), because + * dsl_dataset_snapshot_reserve_space() will increase + * dp_space_towrite but not actually write anything. + */ + data_written = dp->dp_space_towrite[txg & TXG_MASK]; + tx = dmu_tx_create_assigned(dp, txg); dp->dp_read_overhead = 0; @@ -325,11 +366,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) for (ds = list_head(&dp->dp_synced_datasets); ds; ds = list_next(&dp->dp_synced_datasets, ds)) - dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx); + dmu_objset_do_userquota_updates(ds->ds_objset, tx); /* * Sync the datasets again to push out the changes due to - * userquota updates. This must be done before we process the + * userspace updates. This must be done before we process the * sync tasks, because that could cause a snapshot of a dataset * whose ds_bp will be rewritten when we do this 2nd sync. */ @@ -341,6 +382,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } err = zio_wait(zio); + /* + * Move dead blocks from the pending deadlist to the on-disk + * deadlist. 
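+ * In open context dead blocks are only appended to the in-core
+ * ds_pending_deadlist (a bplist); here, in syncing context,
+ * bplist_iterate() hands each pending entry to deadlist_enqueue_cb(),
+ * which inserts it into the persistent dsl_deadlist, so the on-disk
+ * deadlist is only ever modified while syncing.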
+ */ + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) { + bplist_iterate(&ds->ds_pending_deadlist, + deadlist_enqueue_cb, &ds->ds_deadlist, tx); + } + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { /* * No more sync tasks should have been added while we @@ -356,14 +407,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; - if (spa_sync_pass(dp->dp_spa) == 1) - dsl_pool_scrub_sync(dp, tx); - start = gethrtime(); - if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || - list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { + if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || + list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(mosi, zio, tx); + dmu_objset_sync(mos, zio, tx); err = zio_wait(zio); ASSERT(err == 0); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); @@ -376,7 +424,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dmu_tx_commit(tx); - data_written = dp->dp_space_towrite[txg & TXG_MASK]; dp->dp_space_towrite[txg & TXG_MASK] = 0; ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); @@ -401,10 +448,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * amount of write traffic allowed into each transaction group. * Weight the throughput calculation towards the current value: * thru = 3/4 old_thru + 1/4 new_thru + * + * Note: write_time is in nanosecs, so write_time/MICROSEC + * yields millisecs */ ASSERT(zfs_write_limit_min > 0); - if (data_written > zfs_write_limit_min / 8 && write_time > 0) { - uint64_t throughput = (data_written * NANOSEC) / write_time; + if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { + uint64_t throughput = data_written / (write_time / MICROSEC); + if (dp->dp_throughput) dp->dp_throughput = throughput / 4 + 3 * dp->dp_throughput / 4; @@ -412,21 +463,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_throughput = throughput; dp->dp_write_limit = MIN(zfs_write_limit_inflated, MAX(zfs_write_limit_min, - dp->dp_throughput * zfs_txg_synctime)); + dp->dp_throughput * zfs_txg_synctime_ms)); } } void -dsl_pool_zil_clean(dsl_pool_t *dp) +dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { dsl_dataset_t *ds; + objset_t *os; while (ds = list_head(&dp->dp_synced_datasets)) { list_remove(&dp->dp_synced_datasets, ds); - ASSERT(ds->ds_user_ptr != NULL); - zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); + os = ds->ds_objset; + zil_clean(os->os_zil, txg); + ASSERT(!dmu_objset_is_dirty(os, txg)); dmu_buf_rele(ds->ds_dbuf, ds); } + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* @@ -627,6 +681,65 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) tx, DS_FIND_CHILDREN)); } +/* ARGSUSED */ +static int +upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + if (ds->ds_dir->dd_phys->dd_origin_obj) { + dsl_dataset_t *origin; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); + + if (origin->ds_dir->dd_phys->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + origin->ds_dir->dd_phys->dd_clones = zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + 
origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + + dsl_dataset_rele(origin, FTAG); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t obj; + + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* + * We can't use bpobj_alloc(), because spa_version() still + * returns the old version, and we need a new-version bpobj with + * subobj support. So call dmu_object_alloc() directly. + */ + obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, + SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); + VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + + VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, + upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); +} + void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) { @@ -641,7 +754,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); + dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); @@ -653,3 +766,108 @@ dsl_pool_vnrele_taskq(dsl_pool_t *dp) { return (dp->dp_vnrele_taskq); } + +/* + * Walk through the pool-wide zap object of temporary snapshot user holds + * and release them. + */ +void +dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) +{ + zap_attribute_t za; + zap_cursor_t zc; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + + if (zapobj == 0) + return; + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + char *htag; + uint64_t dsobj; + + htag = strchr(za.za_name, '-'); + *htag = '\0'; + ++htag; + dsobj = strtonum(za.za_name, NULL); + (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); + } + zap_cursor_fini(&zc); +} + +/* + * Create the pool-wide zap object for storing temporary snapshot holds. + */ +void +dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + + ASSERT(dp->dp_tmp_userrefs_obj == 0); + ASSERT(dmu_tx_is_syncing(tx)); + + dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, + DMU_OT_NONE, 0, tx); + + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, + sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); +} + +static int +dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + char *name; + int error; + + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * If the pool was created prior to SPA_VERSION_USERREFS, the + * zap object for temporary holds might not exist yet. 
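+ * If so, it is created on demand when taking a hold, while releasing
+ * a hold that was never recorded simply returns ENOENT.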
+ */ + if (zapobj == 0) { + if (holding) { + dsl_pool_user_hold_create_obj(dp, tx); + zapobj = dp->dp_tmp_userrefs_obj; + } else { + return (ENOENT); + } + } + + name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); + if (holding) + error = zap_add(mos, zapobj, name, 8, 1, now, tx); + else + error = zap_remove(mos, zapobj, name, tx); + strfree(name); + + return (error); +} + +/* + * Add a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + uint64_t *now, dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); +} + +/* + * Release a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, + tx, B_FALSE)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c index d064932..aa66b32 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c @@ -19,10 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +#include #include #include #include @@ -31,14 +31,16 @@ #include #include #include -#include /* for the default checksum value */ #include #include #include "zfs_prop.h" +#define ZPROP_INHERIT_SUFFIX "$inherit" +#define ZPROP_RECVD_SUFFIX "$recvd" + static int -dodefault(const char *propname, int intsz, int numint, void *buf) +dodefault(const char *propname, int intsz, int numints, void *buf) { zfs_prop_t prop; @@ -55,9 +57,9 @@ dodefault(const char *propname, int intsz, int numint, void *buf) if (intsz != 1) return (EOVERFLOW); (void) strncpy(buf, zfs_prop_default_string(prop), - numint); + numints); } else { - if (intsz != 8 || numint < 1) + if (intsz != 8 || numints < 1) return (EOVERFLOW); *(uint64_t *)buf = zfs_prop_default_numeric(prop); @@ -68,11 +70,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf) int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numint, void *buf, char *setpoint) + int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) { int err = ENOENT; + dsl_dir_t *target = dd; objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; + boolean_t inheritable; + boolean_t inheriting = B_FALSE; + char *inheritstr; + char *recvdstr; ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); @@ -80,51 +87,135 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); /* - * Note: dd may be NULL, therefore we shouldn't dereference it - * ouside this loop. + * Note: dd may become NULL, therefore we shouldn't dereference it + * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, - propname, intsz, numint, buf); + + if (dd != target || snapshot) { + if (!inheritable) + break; + inheriting = B_TRUE; + } + + /* Check for a local value. 
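+ * A value stored under the plain property name is a local setting;
+ * the companion <prop>$inherit and <prop>$recvd entries
+ * (ZPROP_INHERIT_SUFFIX, ZPROP_RECVD_SUFFIX) mark explicit
+ * inheritance and received values and are consulted next.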
*/ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, + intsz, numints, buf); if (err != ENOENT) { - if (setpoint) + if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); break; } /* - * Break out of this loop for non-inheritable properties. + * Skip the check for a received value if there is an explicit + * inheritance entry. */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, + inheritstr); + if (err != 0 && err != ENOENT) break; + + if (err == ENOENT) { + /* Check for a received value. */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + recvdstr, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) { + if (inheriting) { + dsl_dir_name(dd, setpoint); + } else { + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + } + } + break; + } + } + + /* + * If we found an explicit inheritance entry, err is zero even + * though we haven't yet found the value, so reinitializing err + * at the end of the loop (instead of at the beginning) ensures + * that err has a valid post-loop value. + */ + err = ENOENT; } + if (err == ENOENT) - err = dodefault(propname, intsz, numint, buf); + err = dodefault(propname, intsz, numints, buf); + + strfree(inheritstr); + strfree(recvdstr); return (err); } int dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - int intsz, int numint, void *buf, char *setpoint) + int intsz, int numints, void *buf, char *setpoint) { + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t inheritable; + boolean_t snapshot; + uint64_t zapobj; + ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); + zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj); + + if (zapobj != 0) { + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int err; - if (ds->ds_phys->ds_props_obj) { - int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_props_obj, propname, intsz, numint, buf); + ASSERT(snapshot); + + /* Check for a local value. */ + err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); if (err != ENOENT) { - if (setpoint) + if (setpoint != NULL && err == 0) dsl_dataset_name(ds, setpoint); return (err); } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. + */ + if (inheritable) { + char *inheritstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, zapobj, inheritstr); + strfree(inheritstr); + if (err != 0 && err != ENOENT) + return (err); + } + + if (err == ENOENT) { + /* Check for a received value. 
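+ * Anything other than ENOENT (a hit or a real error) is returned
+ * right away; only ENOENT falls through to the dsl_dir search.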
*/ + char *recvdstr = kmem_asprintf("%s%s", propname, + ZPROP_RECVD_SUFFIX); + err = zap_lookup(mos, zapobj, recvdstr, + intsz, numints, buf); + strfree(recvdstr); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + return (err); + } + } } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numint, buf, setpoint)); + intsz, numints, buf, setpoint, snapshot)); } /* @@ -168,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, - NULL, cbr, &dd)); if (need_rwlock) rw_exit(&dp->dp_config_rwlock); - /* Leave dir open until this callback is unregistered */ return (0); } @@ -210,6 +298,137 @@ dsl_prop_get_integer(const char *ddname, const char *propname, return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } +void +dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value) +{ + psa->psa_name = propname; + psa->psa_source = source; + psa->psa_intsz = 8; + psa->psa_numints = 1; + psa->psa_value = value; + + psa->psa_effective_value = -1ULL; +} + +/* + * Predict the effective value of the given special property if it were set with + * the given value and source. This is not a general purpose function. It exists + * only to handle the special requirements of the quota and reservation + * properties. The fact that these properties are non-inheritable greatly + * simplifies the prediction logic. + * + * Returns 0 on success, a positive error code on failure, or -1 if called with + * a property not handled by this function. + */ +int +dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +{ + const char *propname = psa->psa_name; + zfs_prop_t prop = zfs_name_to_prop(propname); + zprop_source_t source = psa->psa_source; + objset_t *mos; + uint64_t zapobj; + uint64_t version; + char *recvdstr; + int err = 0; + + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_REFRESERVATION: + break; + default: + return (-1); + } + + mos = dd->dd_pool->dp_meta_objset; + zapobj = dd->dd_phys->dd_props_zapobj; + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + version = spa_version(dd->dd_pool->dp_spa); + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + switch (source) { + case ZPROP_SRC_NONE: + /* Revert to the received value, if any. */ + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = 0; + break; + case ZPROP_SRC_LOCAL: + psa->psa_effective_value = *(uint64_t *)psa->psa_value; + break; + case ZPROP_SRC_RECEIVED: + /* + * If there's no local setting, then the new received value will + * be the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = *(uint64_t *)psa->psa_value; + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * We're clearing the received value, so the local setting (if + * it exists) remains the effective value. 
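+ * If there is no local setting either, the effective value falls
+ * back to 0, i.e. no quota/reservation.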
+ */ + err = zap_lookup(mos, zapobj, propname, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = 0; + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); + } + + strfree(recvdstr); + + if (err == ENOENT) + return (0); + + return (err); +} + +#ifdef ZFS_DEBUG +void +dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +{ + zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); + uint64_t intval; + char setpoint[MAXNAMELEN]; + uint64_t version = spa_version(dd->dd_pool->dp_spa); + int err; + + if (version < SPA_VERSION_RECVD_PROPS) { + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + return; + } + } + + err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, + setpoint, B_FALSE); + if (err == 0 && intval != psa->psa_effective_value) { + cmn_err(CE_PANIC, "%s property, source: %x, " + "predicted effective value: %llu, " + "actual effective value: %llu (setpoint: %s)", + psa->psa_name, psa->psa_source, + (unsigned long long)psa->psa_effective_value, + (unsigned long long)intval, setpoint); + } +} +#endif + /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, ENOMSG if no matching callback registered. @@ -241,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - /* Clean up from dsl_prop_register */ - dsl_dir_close(dd, cbr); return (0); } @@ -277,7 +494,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, zap_cursor_t zc; zap_attribute_t *za; int err; - uint64_t dummyval; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); @@ -289,8 +505,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. */ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - 8, 1, &dummyval); + err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); if (err == 0) { dsl_dir_close(dd, FTAG); return; @@ -310,8 +525,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the property is set on this ds, then it is not * inherited here; don't call the callback. 
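 * (Only the entry's existence matters here, hence zap_contains().)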
*/ - if (propobj && 0 == zap_lookup(mos, propobj, propname, - 8, 1, &dummyval)) + if (propobj && 0 == zap_contains(mos, propobj, propname)) continue; cbr->cbr_func(cbr->cbr_arg, value); @@ -331,30 +545,28 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_close(dd, FTAG); } -struct prop_set_arg { - const char *name; - int intsz; - int numints; - const void *buf; -}; - - -static void -dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +void +dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - struct prop_set_arg *psa = arg2; + dsl_prop_setarg_t *psa = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj, intval; + uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; - char *valstr; + char *valstr = NULL; + char *inheritstr; + char *recvdstr; + char *tbuf = NULL; + int err; + uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); + const char *propname = psa->psa_name; + zprop_source_t source = psa->psa_source; - isint = (dodefault(psa->name, 8, 1, &intval) == 0); + isint = (dodefault(propname, 8, 1, &intval) == 0); - if (dsl_dataset_is_snapshot(ds)) { - ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_SNAP_PROPS); + if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (ds->ds_phys->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_props_obj = @@ -366,22 +578,97 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; } - if (psa->numints == 0) { - int err = zap_remove(mos, zapobj, psa->name, tx); + if (version < SPA_VERSION_RECVD_PROPS) { + zfs_prop_t prop = zfs_name_to_prop(propname); + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) + return; + + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + switch (source) { + case ZPROP_SRC_NONE: + /* + * revert to received value, if any (inherit -S) + * - remove propname + * - remove propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + case ZPROP_SRC_LOCAL: + /* + * remove propname$inherit + * set propname -> value + */ + err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - if (isint) { - VERIFY(0 == dsl_prop_get_ds(ds, - psa->name, 8, 1, &intval, NULL)); + VERIFY(0 == zap_update(mos, zapobj, propname, + psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); + break; + case ZPROP_SRC_INHERITED: + /* + * explicitly inherit + * - remove propname + * - set propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + if (version >= SPA_VERSION_RECVD_PROPS && + dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, + NULL) == 0) { + dummy = 0; + err = zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx); + ASSERT(err == 0); } - } else { - VERIFY(0 == zap_update(mos, zapobj, psa->name, - psa->intsz, psa->numints, psa->buf, tx)); - if (isint) - intval = *(uint64_t *)psa->buf; + break; + case ZPROP_SRC_RECEIVED: + /* + * set propname$recvd -> value + */ + err = zap_update(mos, zapobj, recvdstr, + psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); + 
ASSERT(err == 0); + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): + /* + * clear local and received settings + * - remove propname + * - remove propname$inherit + * - remove propname$recvd + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + /* FALLTHRU */ + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * remove propname$recvd + */ + err = zap_remove(mos, zapobj, recvdstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); } + strfree(inheritstr); + strfree(recvdstr); + if (isint) { - if (dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); + + if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -392,58 +679,85 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { if (cbr->cbr_ds == ds && - strcmp(cbr->cbr_propname, psa->name) == 0) + strcmp(cbr->cbr_propname, propname) == 0) cbr->cbr_func(cbr->cbr_arg, intval); } mutex_exit(&ds->ds_dir->dd_lock); } else { dsl_prop_changed_notify(ds->ds_dir->dd_pool, - ds->ds_dir->dd_object, psa->name, intval, TRUE); + ds->ds_dir->dd_object, propname, intval, TRUE); } - } - if (isint) { + (void) snprintf(valbuf, sizeof (valbuf), "%lld", (longlong_t)intval); valstr = valbuf; } else { - valstr = (char *)psa->buf; + if (source == ZPROP_SRC_LOCAL) { + valstr = (char *)psa->psa_value; + } else { + tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + if (dsl_prop_get_ds(ds, propname, 1, + ZAP_MAXVALUELEN, tbuf, NULL) == 0) + valstr = tbuf; + } } - spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT : - LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, - "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); + + spa_history_log_internal((source == ZPROP_SRC_NONE || + source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : + LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, + "%s=%s dataset = %llu", propname, + (valstr == NULL ? "" : valstr), ds->ds_object); + + if (tbuf != NULL) + kmem_free(tbuf, ZAP_MAXVALUELEN); } void -dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - nvlist_t *nvl = arg2; + dsl_props_arg_t *pa = arg2; + nvlist_t *props = pa->pa_props; + dsl_prop_setarg_t psa; nvpair_t *elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - struct prop_set_arg psa; + psa.psa_source = pa->pa_source; - psa.name = nvpair_name(elem); + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; - if (nvpair_type(elem) == DATA_TYPE_STRING) { - VERIFY(nvpair_value_string(elem, - (char **)&psa.buf) == 0); - psa.intsz = 1; - psa.numints = strlen(psa.buf) + 1; + psa.psa_name = nvpair_name(pair); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
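+ * i.e. the pair's value is an nvlist that carries the real value
+ * under ZPROP_VALUE (along with its source), so unwrap it first.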
+ */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(pair, + (char **)&psa.psa_value) == 0); + psa.psa_intsz = 1; + psa.psa_numints = strlen(psa.psa_value) + 1; } else { uint64_t intval; - VERIFY(nvpair_value_uint64(elem, &intval) == 0); - psa.intsz = sizeof (intval); - psa.numints = 1; - psa.buf = &intval; + VERIFY(nvpair_value_uint64(pair, &intval) == 0); + psa.psa_intsz = sizeof (intval); + psa.psa_numints = 1; + psa.psa_value = &intval; } - dsl_prop_set_sync(ds, &psa, cr, tx); + dsl_prop_set_sync(ds, &psa, tx); } } void -dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx) +dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t zapobj = dd->dd_phys->dd_props_zapobj; @@ -454,18 +768,19 @@ dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); - spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, "%s=%llu dataset = %llu", name, (u_longlong_t)val, dd->dd_phys->dd_head_dataset_obj); } int -dsl_prop_set(const char *dsname, const char *propname, +dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, int intsz, int numints, const void *buf) { dsl_dataset_t *ds; + uint64_t version; int err; - struct prop_set_arg psa; + dsl_prop_setarg_t psa; /* * We must do these checks before we get to the syncfunc, since @@ -473,23 +788,30 @@ dsl_prop_set(const char *dsname, const char *propname, */ if (strlen(propname) >= ZAP_MAXNAMELEN) return (ENAMETOOLONG); - if (intsz * numints >= ZAP_MAXVALUELEN) - return (E2BIG); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } if (dsl_dataset_is_snapshot(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } - psa.name = propname; - psa.intsz = intsz; - psa.numints = numints; - psa.buf = buf; + psa.psa_name = propname; + psa.psa_source = source; + psa.psa_intsz = intsz; + psa.psa_numints = numints; + psa.psa_value = buf; + psa.psa_effective_value = -1ULL; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, dsl_prop_set_sync, ds, &psa, 2); @@ -498,158 +820,318 @@ dsl_prop_set(const char *dsname, const char *propname, } int -dsl_props_set(const char *dsname, nvlist_t *nvl) +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) { dsl_dataset_t *ds; + uint64_t version; nvpair_t *elem = NULL; + dsl_props_arg_t pa; int err; + if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + return (err); /* * Do these checks before the syncfunc, since it can't fail. 
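 * In particular the value-length limit depends on the pool version:
 * ZAP_OLDMAXVALUELEN before SPA_VERSION_STMF_PROP, ZAP_MAXVALUELEN after.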
*/ - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + dsl_dataset_rele(ds, FTAG); return (ENAMETOOLONG); + } if (nvpair_type(elem) == DATA_TYPE_STRING) { char *valstr; VERIFY(nvpair_value_string(elem, &valstr) == 0); - if (strlen(valstr) >= ZAP_MAXVALUELEN) + if (strlen(valstr) >= (version < + SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); return (E2BIG); + } } } - if (err = dsl_dataset_hold(dsname, FTAG, &ds)) - return (err); - if (dsl_dataset_is_snapshot(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } + pa.pa_props = props; + pa.pa_source = source; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_props_set_sync, ds, nvl, 2); + NULL, dsl_props_set_sync, ds, &pa, 2); dsl_dataset_rele(ds, FTAG); return (err); } +typedef enum dsl_prop_getflags { + DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ + DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ + DSL_PROP_GET_LOCAL = 0x4, /* local properties */ + DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ +} dsl_prop_getflags_t; + +static int +dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, + const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err = 0; + + for (zap_cursor_init(&zc, mos, propobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop; + char buf[ZAP_MAXNAMELEN]; + char *valstr; + const char *suffix; + const char *propname; + const char *source; + + suffix = strchr(za.za_name, '$'); + + if (suffix == NULL) { + /* + * Skip local properties if we only want received + * properties. + */ + if (flags & DSL_PROP_GET_RECEIVED) + continue; + + propname = za.za_name; + source = setpoint; + } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { + /* Skip explicitly inherited entries. */ + continue; + } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { + if (flags & DSL_PROP_GET_LOCAL) + continue; + + (void) strncpy(buf, za.za_name, (suffix - za.za_name)); + buf[suffix - za.za_name] = '\0'; + propname = buf; + + if (!(flags & DSL_PROP_GET_RECEIVED)) { + /* Skip if locally overridden. */ + err = zap_contains(mos, propobj, propname); + if (err == 0) + continue; + if (err != ENOENT) + break; + + /* Skip if explicitly inherited. */ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, propobj, valstr); + strfree(valstr); + if (err == 0) + continue; + if (err != ENOENT) + break; + } + + source = ((flags & DSL_PROP_GET_INHERITING) ? + setpoint : ZPROP_SOURCE_VAL_RECVD); + } else { + /* + * For backward compatibility, skip suffixes we don't + * recognize. + */ + continue; + } + + prop = zfs_name_to_prop(propname); + + /* Skip non-inheritable properties. */ + if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop)) + continue; + + /* Skip properties not valid for this type. */ + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined. 
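+ * The caller walks from the dataset toward the pool root, so a value
+ * recorded at an earlier (closer) level always wins over an ancestor's.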
*/ + if (nvlist_exists(nv, propname)) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; + return (err); +} + /* * Iterate over all properties for this dataset and return them in an nvlist. */ -int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local) +static int +dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, + dsl_prop_getflags_t flags) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; dsl_dir_t *dd = ds->ds_dir; - boolean_t snapshot = dsl_dataset_is_snapshot(ds); - int err = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; - uint64_t propobj = ds->ds_phys->ds_props_obj; + int err = 0; + char setpoint[MAXNAMELEN]; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (local && snapshot && !propobj) - return (0); + if (dsl_dataset_is_snapshot(ds)) + flags |= DSL_PROP_GET_SNAPSHOT; rw_enter(&dp->dp_config_rwlock, RW_READER); - while (dd != NULL) { - char setpoint[MAXNAMELEN]; - zap_cursor_t zc; - zap_attribute_t za; - dsl_dir_t *dd_next; - - if (propobj) { - dsl_dataset_name(ds, setpoint); - dd_next = dd; - } else { - dsl_dir_name(dd, setpoint); - propobj = dd->dd_phys->dd_props_zapobj; - dd_next = dd->dd_parent; + + if (ds->ds_phys->ds_props_obj != 0) { + ASSERT(flags & DSL_PROP_GET_SNAPSHOT); + dsl_dataset_name(ds, setpoint); + err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj, + setpoint, flags, *nvp); + if (err) + goto out; + } + + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { + if (flags & (DSL_PROP_GET_LOCAL | + DSL_PROP_GET_RECEIVED)) + break; + flags |= DSL_PROP_GET_INHERITING; } + dsl_dir_name(dd, setpoint); + err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj, + setpoint, flags, *nvp); + if (err) + break; + } +out: + rw_exit(&dp->dp_config_rwlock); + return (err); +} - for (zap_cursor_init(&zc, mos, propobj); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - nvlist_t *propval; - zfs_prop_t prop = zfs_name_to_prop(za.za_name); +boolean_t +dsl_prop_get_hasrecvd(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int rc; + uint64_t dummy; - /* Skip non-inheritable properties. */ - if (prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop) && - (dd != ds->ds_dir || (snapshot && dd != dd_next))) - continue; + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); + return (rc == 0); +} - /* Skip properties not valid for this type. 
*/ - if (snapshot && prop != ZPROP_INVAL && - !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) - continue; +static void +dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + uint64_t dummy = 0; + dsl_prop_setarg_t psa; - /* Skip properties already defined */ - if (nvlist_lookup_nvlist(*nvp, za.za_name, - &propval) == 0) - continue; + if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) + return; - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - if (za.za_integer_length == 1) { - /* - * String property - */ - char *tmp = kmem_alloc(za.za_num_integers, - KM_SLEEP); - err = zap_lookup(mos, propobj, - za.za_name, 1, za.za_num_integers, tmp); - if (err != 0) { - kmem_free(tmp, za.za_num_integers); - break; - } - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, - tmp) == 0); - kmem_free(tmp, za.za_num_integers); - } else { - /* - * Integer property - */ - ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, ZPROP_VALUE, - za.za_first_integer); - } + dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, - setpoint) == 0); - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - nvlist_free(propval); - } - zap_cursor_fini(&zc); + (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, + dsl_prop_set_sync, ds, &psa, 2); +} - if (err != ENOENT) - break; - err = 0; - /* - * If we are just after the props that have been set - * locally, then we are done after the first iteration. - */ - if (local) - break; - dd = dd_next; - propobj = 0; +/* + * Call after successfully receiving properties to ensure that only the first + * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. + */ +void +dsl_prop_set_hasrecvd(objset_t *os) +{ + if (dsl_prop_get_hasrecvd(os)) { + ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); + return; } - rw_exit(&dp->dp_config_rwlock); + dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); +} - return (err); +void +dsl_prop_unset_hasrecvd(objset_t *os) +{ + dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); +} + +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +{ + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); +} + +int +dsl_prop_get_received(objset_t *os, nvlist_t **nvp) +{ + /* + * Received properties are not distinguishable from local properties + * until the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? + DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); } void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) { nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + uint64_t default_value; + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + return; + } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + /* Indicate the default source if we can. 
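+ * An empty ZPROP_SOURCE string stands for the default source; it is
+ * only added when the supplied value matches the property's default.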
*/ + if (dodefault(propname, 8, 1, &default_value) == 0 && + value == default_value) { + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); + } + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } @@ -657,9 +1139,15 @@ void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) { nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + return; + } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c new file mode 100644 index 0000000..56d4108 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -0,0 +1,1766 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif + +typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + +static scan_cb_t dsl_scan_defrag_cb; +static scan_cb_t dsl_scan_scrub_cb; +static scan_cb_t dsl_scan_remove_cb; +static dsl_syncfunc_t dsl_scan_cancel_sync; +static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); + +int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ +int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +int zfs_scan_idle = 50; /* idle window in clock ticks */ + +int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ + +#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + +extern int zfs_txg_timeout; + +/* the order has to match pool_scan_type */ +static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { + NULL, + dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ +}; + +int +dsl_scan_init(dsl_pool_t *dp, uint64_t txg) +{ + int err; + dsl_scan_t *scn; + spa_t *spa = dp->dp_spa; + uint64_t f; + + scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); + scn->scn_dp = dp; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + if (scn->scn_phys.scn_state == DSS_SCANNING && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. 
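+ * (spa_prev_software_version() reports what last wrote the pool, so
+ * this path is only taken when a pre-SPA_VERSION_SCAN binary could
+ * actually have modified it.)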
+ */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + scn->scn_restart_txg); + } + } + + spa_scan_stat_init(spa); + return (0); +} + +void +dsl_scan_fini(dsl_pool_t *dp) +{ + if (dp->dp_scan) { + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } +} + +/* ARGSUSED */ +static int +dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (EBUSY); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + pool_scan_func_t *funcp = arg2; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_restart_txg = 0; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + } else { + spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + dsl_scan_sync_state(scn, tx); + + spa_history_log_internal(LOG_POOL_SCAN, spa, tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); +} + +/* ARGSUSED */ +static void +dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. */ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. 
+ */ + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (complete) + scn->scn_phys.scn_state = DSS_FINISHED; + else + scn->scn_phys.scn_state = DSS_CANCELED; + + spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, + "complete=%u", complete); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + spa->spa_scrub_started = B_FALSE; + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); + if (complete) { + spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } + spa_errlog_rotate(spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + } + + scn->scn_phys.scn_end_time = gethrestime_sec(); +} + +/* ARGSUSED */ +static int +dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return (ENOENT); + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + dsl_scan_done(scn, B_FALSE, tx); + dsl_scan_sync_state(scn, tx); +} + +int +dsl_scan_cancel(dsl_pool_t *dp) +{ + boolean_t complete = B_FALSE; + int err; + + err = dsl_sync_task_do(dp, dsl_scan_cancel_check, + dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); + return (err); +} + +static void dsl_scan_visitbp(blkptr_t *bp, + const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx); +static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, + dmu_objset_type_t ostype, + dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx); + +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) +{ + zio_free(dp->dp_spa, txg, bp); +} + +void +dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) +{ + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); +} + +int +dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + return (arc_read(pio, spa, bpp, pbuf, done, private, + priority, zio_flags, arc_flags, zb)); +} + +int +dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + return (arc_read_nolock(pio, spa, bpp, done, private, + priority, zio_flags, arc_flags, zb)); +} + +static boolean_t +bookmark_is_zero(const zbookmark_t *zb) +{ + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); +} + +/* dnp is the dnode for zb1->zb_object */ +static boolean_t +bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. 
+ */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +{ + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (dsl_dataset_is_snapshot(ds)) + return (MIN(smt, ds->ds_phys->ds_creation_txg)); + return (smt); +} + +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); +} + +static boolean_t +dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) +{ + uint64_t elapsed_nanosecs; + int mintime; + + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) + return (B_FALSE); + + if (scn->scn_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb && zb->zb_level != 0) + return (B_FALSE); + + mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scan_min_time_ms; + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > mintime && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + scn->scn_phys.scn_bookmark = *zb; + } + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + scn->scn_pausing = B_TRUE; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_scan_arg { + dsl_pool_t *zsa_dp; + zil_header_t *zsa_zh; +} zil_scan_arg_t; + +/* ARGSUSED */ +static int +dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * scrub there's nothing to do to it). 
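+ * So only unclaimed blocks born in or after the txg in which the pool
+ * was opened (spa_first_txg()) are skipped below; older allocations
+ * are handed to the scan callback.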
+ */ + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); +} + +/* ARGSUSED */ +static int +dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); +} + +static void +dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg); + + zil_free(zilog); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, + uint64_t objset, uint64_t object, uint64_t blkid) +{ + zbookmark_t czb; + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + /* + * XXX need to make sure all of these arc_read() prefetches are + * done before setting xlateall (similar to dsl_read()) + */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); +} + +static boolean_t +dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. 
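The two ZIL callbacks above apply the same filtering idea: skip blocks already covered by an earlier pass (birth <= cur_min_txg) and only look at log blocks that were actually claimed, except for the pre-claim "stubby" case described in the comment. A compact sketch of those predicates, with plain integers standing in for the block-pointer fields (illustrative only), might be:

#include <stdbool.h>
#include <stdint.h>

/*
 * Whether a ZIL chain block should be scanned.  first_txg is the first txg
 * of the current pool import; claim_txg == 0 means the log has not been
 * claimed yet.
 */
bool
zil_chain_block_needs_scan(uint64_t blk_birth, uint64_t claim_txg,
    uint64_t cur_min_txg, uint64_t first_txg)
{
	if (blk_birth <= cur_min_txg)		/* covered by an earlier pass */
		return (false);
	if (claim_txg == 0 && blk_birth >= first_txg)
		return (false);			/* will be claimed on replay */
	return (true);				/* including the old "stubby" */
}

/* Whether the block named by a TX_WRITE log record should be scanned. */
bool
zil_write_record_needs_scan(uint64_t blk_birth, uint64_t claim_txg,
    uint64_t cur_min_txg)
{
	if (blk_birth <= cur_min_txg)
		return (false);
	/* birth < claim_txg means the data block was already synced out */
	if (claim_txg == 0 || blk_birth < claim_txg)
		return (false);
	return (true);
}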
+ */ + if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + } + } + return (B_FALSE); +} + +/* + * Return nonzero on i/o error. + * Return new buf to write out in *bufp. + */ +static int +dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) +{ + dsl_pool_t *dp = scn->scn_dp; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + int err; + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + dsl_scan_visitbp(cbp, &czb, dnp, + *bufp, ds, scn, ostype, tx); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { + uint32_t flags = ARC_WAIT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *cdnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + dsl_scan_prefetch(scn, *bufp, cbp, + zb->zb_objset, zb->zb_blkid * epb + i, j); + } + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + dsl_scan_visitdnode(scn, ds, ostype, + cdnp, *bufp, zb->zb_blkid * epb + i, tx); + } + + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + + osp = (*bufp)->b_data; + + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); + + if (OBJSET_BUF_HAS_USERUSED(*bufp)) { + /* + * We also always visit user/group accounting + * objects, and never skip them, even if we are + * pausing. This is necessary so that the space + * deltas from this txg get integrated. 
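When dsl_scan_recurse() descends through an indirect block it derives one child bookmark per embedded block pointer, and when it descends through a meta-dnode block it derives one object number per embedded dnode; the arithmetic is just the parent blkid scaled by the entries-per-block count. A small sketch of that arithmetic, assuming the usual 128-byte block pointers and 512-byte dnodes (an assumption restated in the defines below), would be:

#include <stdint.h>
#include <stdio.h>

#define SPA_BLKPTRSHIFT	7	/* assumed: 128-byte block pointers */
#define DNODE_SHIFT	9	/* assumed: 512-byte dnodes */

/* Child block id at level (parent_level - 1) for the i-th pointer. */
uint64_t
child_blkid(uint64_t parent_blkid, uint64_t parent_lsize, int i)
{
	uint64_t epb = parent_lsize >> SPA_BLKPTRSHIFT; /* entries per block */

	return (parent_blkid * epb + i);
}

/* Object number of the i-th dnode held in a meta-dnode block. */
uint64_t
child_object(uint64_t dnode_blkid, uint64_t dnode_block_lsize, int i)
{
	uint64_t epb = dnode_block_lsize >> DNODE_SHIFT;

	return (dnode_blkid * epb + i);
}

int
main(void)
{
	/* A 16K indirect block holds 128 blkptrs; blkid 3, entry 5 -> 389. */
	printf("%llu\n", (unsigned long long)child_blkid(3, 16384, 5));
	/* A 16K dnode block holds 32 dnodes; blkid 2, dnode 7 -> object 71. */
	printf("%llu\n", (unsigned long long)child_object(2, 16384, 7));
	return (0);
}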
+ */ + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_groupused_dnode, *bufp, + DMU_GROUPUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_userused_dnode, *bufp, + DMU_USERUSED_OBJECT, tx); + } + } + + return (0); +} + +static void +dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, + dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t object, dmu_tx_t *tx) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + dnp->dn_nlevels - 1, j); + dsl_scan_visitbp(&dnp->dn_blkptr[j], + &czb, dnp, buf, ds, scn, ostype, tx); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_t czb; + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + 0, DMU_SPILL_BLKID); + dsl_scan_visitbp(&dnp->dn_spill, + &czb, dnp, buf, ds, scn, ostype, tx); + } +} + +/* + * The arguments are in this order because mdb can only print the + * first 5; we want them to be useful. + */ +static void +dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, + dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + arc_buf_t *buf = NULL; + blkptr_t bp_toread = *bp; + + /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + + if (dsl_scan_check_pause(scn, zb)) + return; + + if (dsl_scan_check_resume(scn, dnp, zb)) + return; + + if (bp->blk_birth == 0) + return; + + scn->scn_visited_this_txg++; + + dprintf_bp(bp, + "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p", + ds, ds ? ds->ds_object : 0, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + pbuf, bp); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return; + + if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { + /* + * For non-user-accounting blocks, we need to read the + * new bp (from a deleted snapshot, found in + * check_existing_xlation). If we used the old bp, + * pointers inside this block from before we resumed + * would be untranslated. + * + * For user-accounting blocks, we need to read the old + * bp, because we will apply the entire space delta to + * it (original untranslated -> translations from + * deleted snap -> now). + */ + bp_toread = *bp; + } + + if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, + &buf) != 0) + return; + + /* + * If dsl_scan_ddt() has aready visited this block, it will have + * already done any translations or scrubbing, so don't call the + * callback again. + */ + if (ddt_class_contains(dp->dp_spa, + scn->scn_phys.scn_ddt_class_max, bp)) { + ASSERT(buf == NULL); + return; + } + + /* + * If this block is from the future (after cur_max_txg), then we + * are doing this on behalf of a deleted snapshot, and we will + * revisit the future block on the next pass of this dataset. + * Don't scan it now unless we need to because something + * under it was modified. + */ + if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + } + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, + dmu_tx_t *tx) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + dsl_scan_visitbp(bp, &zb, NULL, NULL, + ds, scn, DMU_OST_NONE, tx); + + dprintf_ds(ds, "finished scan%s", ""); +} + +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (dsl_dataset_is_snapshot(ds)) { + /* Note, scn_cur_{min,max}_txg stays the same. */ + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_next_snap_obj; + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + } else { + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object); + } + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + if (dsl_dataset_is_snapshot(ds)) { + /* + * We keep the same mintxg; it could be > + * ds_creation_txg if the previous snapshot was + * deleted too. + */ + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("destroying ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + } else { + zfs_dbgmsg("destroying ds %llu; in queue; removing", + (u_longlong_t)ds->ds_object); + } + } else { + zfs_dbgmsg("destroying ds %llu; ignoring", + (u_longlong_t)ds->ds_object); + } + + /* + * dsl_scan_sync() should be called after this, and should sync + * out our changed state, but just to be safe, do it here. 
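dsl_scan_ds_destroyed() above keeps the persistent scan state coherent while datasets come and go under it: if the destroyed dataset is the one currently being traversed, the bookmark is redirected (to the next snapshot, or to the destroyed-objset sentinel for a head); if it is merely sitting in the work queue, its queue entry is handed to the next snapshot with the same minimum txg. A toy model of the queue half of that fix-up, using a small in-memory table instead of a ZAP object (purely illustrative), could look like:

#include <stdint.h>
#include <stdio.h>

#define QUEUE_SLOTS 8

struct queue_entry {
	uint64_t dsobj;		/* dataset object number, 0 = empty slot */
	uint64_t mintxg;	/* resume-from txg recorded when enqueued */
};

struct queue_entry queue[QUEUE_SLOTS];

static struct queue_entry *
queue_find(uint64_t dsobj)
{
	for (int i = 0; i < QUEUE_SLOTS; i++)
		if (queue[i].dsobj == dsobj)
			return (&queue[i]);
	return (NULL);
}

/*
 * A queued snapshot was destroyed: hand its slot (and its mintxg) to the
 * snapshot that follows it, mirroring the zap_remove/zap_add pair above.
 */
void
queue_replace_destroyed(uint64_t dsobj, uint64_t next_snap_obj)
{
	struct queue_entry *e = queue_find(dsobj);

	if (e == NULL)
		return;			/* not queued; nothing to fix up */
	e->dsobj = next_snap_obj;	/* mintxg is deliberately kept */
}

int
main(void)
{
	queue[0] = (struct queue_entry){ .dsobj = 42, .mintxg = 100 };
	queue_replace_destroyed(42, 57);
	printf("slot 0: ds %llu mintxg %llu\n",
	    (unsigned long long)queue[0].dsobj,
	    (unsigned long long)queue[0].mintxg);
	return (0);
}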
+ */ + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds1->ds_object, &mintxg) == 0) { + int err; + + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + err = zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds1->ds_object, mintxg, tx)); + } + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx); +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void 
*arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + if (dmu_objset_from_ds(ds, &os)) + goto out; + + /* + * Only the ZIL in the head (non-snapshot) is valid. Even though + * snapshots can have ZIL block pointers (which may be the same + * BP as in the head), they must be ignored. So we traverse the + * ZIL here, rather than in scan_recurse(), because the regular + * snapshot block-sharing rules don't apply to it. + */ + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) + dsl_scan_zil(dp, &os->os_zil_header); + + /* + * Iterate over the bps in this ds. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); + + char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " + "pausing=%u", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_cur_max_txg, + (int)scn->scn_pausing); + kmem_free(dsname, ZFS_MAXNAMELEN); + + if (scn->scn_pausing) + goto out; + + /* + * We've finished this pass over this dataset. + */ + + /* + * If we did not completely visit this dataset, do another pass. + */ + if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { + zfs_dbgmsg("incomplete pass; visiting again"); + scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + scn->scn_phys.scn_cur_max_txg, tx) == 0); + goto out; + } + + /* + * Add descendent datasets to work queue. + */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. 
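enqueue_clones_cb() above starts from each clone head and walks its snapshot chain backwards until it reaches the dataset whose previous snapshot is the origin, and that is the dataset it enqueues. A linked-list sketch of the same walk, with a hypothetical dataset struct in place of the DSL hold/rele machinery, might be:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for dsl_dataset_t: only the fields the walk needs. */
struct dataset {
	uint64_t obj;			/* object number */
	struct dataset *prev_snap;	/* analogue of ds_prev_snap_obj */
};

/*
 * Starting from a clone head, find the earliest dataset in its chain whose
 * previous snapshot is the origin; that is where the scan resumes.
 */
struct dataset *
first_after_origin(struct dataset *ds, uint64_t originobj)
{
	while (ds->prev_snap != NULL && ds->prev_snap->obj != originobj)
		ds = ds->prev_snap;
	return (ds);
}

int
main(void)
{
	struct dataset origin = { 10, NULL };
	struct dataset snap_a = { 20, &origin };
	struct dataset snap_b = { 30, &snap_a };
	struct dataset head   = { 40, &snap_b };

	printf("enqueue ds %llu\n",
	    (unsigned long long)first_after_origin(&head, origin.obj)->obj);
	return (0);
}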
+ */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join_key(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } else { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. + * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. 
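The block comment above is the heart of the dedup/scrub interaction: blocks whose DDT entries sit in a class at or below scn_ddt_class_max are scrubbed once during the DDT walk, and the later top-down traversal must skip them, which is what the ddt_class_contains() check in dsl_scan_visitbp() enforces. A compact sketch of that skip decision, with the class ordering taken from the description above (this is an illustration, not the ddt.h definition), could be:

#include <stdbool.h>

/*
 * Replication classes, ordered as described above: highest replication
 * first, so "class <= scn_ddt_class_max" means "already covered by the
 * DDT phase of the scrub".
 */
enum ddt_class {
	DDT_CLASS_DITTO = 0,	/* deduped blocks with extra ditto copies */
	DDT_CLASS_DUPLICATE,	/* deduped blocks, refcnt > 1 */
	DDT_CLASS_UNIQUE,	/* in the DDT but refcnt == 1 */
	DDT_CLASSES
};

/*
 * Should the top-down traversal issue scrub i/o for this block?
 * block_class is the class the block's entry currently lives in, or
 * DDT_CLASSES if the block is not in the dedup table at all.
 */
bool
topdown_should_scrub(enum ddt_class block_class, enum ddt_class class_max)
{
	return (block_class > class_max);
}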
+ */ +static void +dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) +{ + ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; + ddt_entry_t dde = { 0 }; + int error; + uint64_t n = 0; + + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + ddt_t *ddt; + + if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) + break; + dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + + /* There should be no pending changes to the dedup table */ + ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; + ASSERT(avl_first(&ddt->ddt_tree) == NULL); + + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + n++; + + if (dsl_scan_check_pause(scn, NULL)) + break; + } + + zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", + (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_pausing); + + ASSERT(error == 0 || error == ENOENT); + ASSERT(error != ENOENT || + ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); +} + +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_t zb = { 0 }; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +static void +dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + zap_cursor_t zc; + zap_attribute_t za; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_ddt(scn, tx); + if (scn->scn_pausing) + return; + } + + if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { + /* First do the MOS & ORIGIN */ + + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_visit_rootbp(scn, NULL, + &dp->dp_meta_rootbp, tx); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + if (scn->scn_pausing) + return; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + dsl_scan_visitds(scn, + dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!scn->scn_pausing); + } else if (scn->scn_phys.scn_bookmark.zb_objset != + ZB_DESTROYED_OBJSET) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset may + * be -1, so we will skip this and find a new objset + * below. + */ + dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + if (scn->scn_pausing) + return; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. 
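A recurring filter in the scan functions above (dsl_scan_visitbp(), dsl_scan_zil_block(), dsl_scan_ddt_entry()) is the per-pass birth window: blocks born at or before the current minimum txg were covered by an earlier pass and are skipped, while blocks born after the current maximum are left for a later pass over the same dataset. A tiny, simplified sketch of that classification (it ignores the "recurse anyway" subtlety noted in dsl_scan_visitbp()) might be:

#include <stdint.h>

enum visit_action {
	VISIT_SKIP_OLD,		/* covered by an earlier pass (or a hole) */
	VISIT_NOW,		/* scan/scrub it in this pass */
	VISIT_LATER		/* born after cur_max_txg; a later pass */
};

enum visit_action
classify_birth(uint64_t blk_birth, uint64_t cur_min_txg, uint64_t cur_max_txg)
{
	if (blk_birth <= cur_min_txg)
		return (VISIT_SKIP_OLD);
	if (blk_birth <= cur_max_txg)
		return (VISIT_NOW);
	return (VISIT_LATER);
}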
+ */ + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + dsl_dataset_t *ds; + uint64_t dsobj; + + dsobj = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, dsobj, tx)); + + /* Set up min/max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (za.za_first_integer != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + za.za_first_integer); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + ds->ds_phys->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + zap_cursor_fini(&zc); + if (scn->scn_pausing) + return; + } + zap_cursor_fini(&zc); +} + +static int +dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + uint64_t elapsed_nanosecs; + + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) + return (ERESTART); + + zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, 0)); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + scn->scn_visited_this_txg++; + return (0); +} + +boolean_t +dsl_scan_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t used = 0, comp, uncomp; + + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (B_TRUE); + + if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, + &used, &comp, &uncomp); + } + return (used != 0); +} + +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + int err; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). + */ + if (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, tx->tx_txg); + dsl_scan_setup_sync(scn, &func, tx); + } + + if (!dsl_scan_active(scn) || + spa_sync_pass(dp->dp_spa) > 1) + return; + + scn->scn_visited_this_txg = 0; + scn->scn_pausing = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + /* + * First process the free list. If we pause the free, don't do + * any scanning. This ensures that there is no free list when + * we are scanning, so the scan code doesn't have to worry about + * traversing it. 
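dsl_scan_free_cb() above shows how free-list processing shares the txg time budget with the scan proper: each callback re-checks the elapsed time and bails out with ERESTART, so bpobj_iterate() stops and the remaining frees simply carry over to the next txg. A stand-alone sketch of an iterator driven that way, with a hypothetical work list in place of the bpobj and a fake per-"txg" budget, might be:

#include <errno.h>
#include <stdio.h>

#ifndef ERESTART
#define ERESTART	85	/* kernel-private errno; value is a stand-in */
#endif

#define TXG_BUDGET	3	/* pretend we may only process 3 items per "txg" */

/* Hypothetical work item; in the real code this is a block pointer to free. */
typedef int work_item_t;

static int processed_this_txg;

/* Callback: returns ERESTART once the per-txg budget is exhausted. */
static int
free_cb(work_item_t item)
{
	if (processed_this_txg >= TXG_BUDGET)
		return (ERESTART);
	printf("freeing item %d\n", item);
	processed_this_txg++;
	return (0);
}

/* Iterate until the callback asks us to stop; remember how far we got. */
static int
iterate(work_item_t *items, int n, int *resume)
{
	for (int i = *resume; i < n; i++) {
		int err = free_cb(items[i]);

		if (err != 0) {
			*resume = i;	/* pick up here next txg */
			return (err);
		}
	}
	*resume = n;
	return (0);
}

int
main(void)
{
	work_item_t items[] = { 1, 2, 3, 4, 5 };
	int resume = 0;

	while (iterate(items, 5, &resume) == ERESTART)
		processed_this_txg = 0;		/* a new txg: budget resets */
	return (0);
}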
+ */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bpobj_iterate(&dp->dp_free_bpobj, + dsl_scan_free_cb, scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj txg %llu", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + (gethrtime() - scn->scn_sync_start_time) / MICROSEC, + (longlong_t)tx->tx_txg); + scn->scn_visited_this_txg = 0; + /* + * Re-sync the ddt so that we can further modify + * it when doing bprewrite. + */ + ddt_sync(spa, tx->tx_txg); + } + if (err == ERESTART) + return; + } + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + } else { + zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, + (longlong_t)scn->scn_phys.scn_bookmark.zb_object, + (longlong_t)scn->scn_phys.scn_bookmark.zb_level, + (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + } + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_scan_visit(scn, tx); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("visited %llu blocks in %llums", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); + + if (!scn->scn_pausing) { + /* finished with scan. */ + zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); + dsl_scan_done(scn, B_TRUE, tx); + } + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + } + + dsl_scan_sync_state(scn, tx); +} + +/* + * This will start a new scan, or restart an existing one. + */ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +/* + * scrub consumers + */ + +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_priority; + int scan_delay = 0; + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) + return (0); + + count_block(dp->dp_blkstats, bp); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + scan_delay = zfs_scrub_delay; + } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + scan_delay = zfs_resilver_delay; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. 
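dsl_scan_scrub_cb() above limits the number of scrub reads in flight (a per-top-level-vdev cap enforced with spa_scrub_lock and spa_scrub_io_cv) and additionally delays new scrub i/o when the pool has seen "important" i/o within the last zfs_scan_idle ticks. A pthreads sketch of just the inflight throttle, with an arbitrary cap in place of the vdev-derived one (illustrative only), could be:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_INFLIGHT	4	/* stand-in for children * zfs_top_maxinflight */

static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t scrub_io_cv = PTHREAD_COND_INITIALIZER;
static int scrub_inflight;

/* Called before issuing a scrub read: block while too many are in flight. */
static void
scrub_io_start(void)
{
	pthread_mutex_lock(&scrub_lock);
	while (scrub_inflight >= MAX_INFLIGHT)
		pthread_cond_wait(&scrub_io_cv, &scrub_lock);
	scrub_inflight++;
	pthread_mutex_unlock(&scrub_lock);
}

/* Called from the i/o completion path: wake up the waiting producer. */
static void
scrub_io_done(void)
{
	pthread_mutex_lock(&scrub_lock);
	scrub_inflight--;
	pthread_cond_broadcast(&scrub_io_cv);
	pthread_mutex_unlock(&scrub_lock);
}

static void *
fake_io(void *arg)
{
	(void)arg;
	usleep(1000);		/* pretend the read takes a little while */
	scrub_io_done();
	return (NULL);
}

int
main(void)
{
	pthread_t t[16];

	for (int i = 0; i < 16; i++) {
		scrub_io_start();	/* throttles to MAX_INFLIGHT at a time */
		pthread_create(&t[i], NULL, fake_io, NULL);
	}
	for (int i = 0; i < 16; i++)
		pthread_join(t[i], NULL);
	printf("inflight at exit: %d\n", scrub_inflight);
	return (0);
}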
+ */ + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); + } + } + } + + if (needs_io && !zfs_no_scrub_io) { + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + /* + * If we're seeing recent (zfs_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. + */ + if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) + delay(scan_delay); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_scan_scrub_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + return (dsl_sync_task_do(dp, dsl_scan_setup_check, + dsl_scan_setup_sync, dp->dp_scan, &func, 0)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c deleted file mode 100644 index 50cc069..0000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c +++ /dev/null @@ -1,1060 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); - -static scrub_cb_t dsl_pool_scrub_clean_cb; -static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; -static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object); - -int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ -int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ -boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ - -extern int zfs_txg_timeout; - -static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { - NULL, - dsl_pool_scrub_clean_cb -}; - -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -/* ARGSUSED */ -static void -dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - enum scrub_func *funcp = arg2; - dmu_object_type_t ot = 0; - boolean_t complete = B_FALSE; - - dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); - - ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); - ASSERT(*funcp > SCRUB_FUNC_NONE); - ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); - - dp->dp_scrub_min_txg = 0; - dp->dp_scrub_max_txg = tx->tx_txg; - - if (*funcp == SCRUB_FUNC_CLEAN) { - vdev_t *rvd = dp->dp_spa->spa_root_vdev; - - /* rewrite all disk labels */ - vdev_config_dirty(rvd); - - if (vdev_resilver_needed(rvd, - &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_RESILVER_START); - dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, - tx->tx_txg); - } else { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_SCRUB_START); - } - - /* zero out the scrub stats in all vdev_stat_t's */ - vdev_scrub_stat_update(rvd, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - - dp->dp_spa->spa_scrub_started = B_TRUE; - } - - /* back to the generic stuff */ - - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) - ot = DMU_OT_ZAP_OTHER; - - dp->dp_scrub_func = *funcp; - dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, - ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - dp->dp_scrub_restart = B_FALSE; - dp->dp_spa->spa_scrub_errors = 0; - - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); - - spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, - "func=%u mintxg=%llu maxtxg=%llu", - *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); -} - -int -dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) -{ - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_setup_sync, dp, &func, 0)); -} - -/* ARGSUSED */ -static void -dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - boolean_t *completep = arg2; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - mutex_enter(&dp->dp_scrub_cancel_lock); - - if (dp->dp_scrub_restart) { - dp->dp_scrub_restart = B_FALSE; - *completep = B_FALSE; - } - - /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_started = B_FALSE; - dp->dp_spa->spa_scrub_active = B_FALSE; - - dp->dp_scrub_func = SCRUB_FUNC_NONE; - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, tx)); - dp->dp_scrub_queue_obj = 0; - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, tx)); - - spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, - "complete=%u", *completep); - - /* below is scrub-clean specific */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, - *completep); - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. - */ - vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, - *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - if (*completep) - spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? 
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); - spa_errlog_rotate(dp->dp_spa); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); - - dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -int -dsl_pool_scrub_cancel(dsl_pool_t *dp) -{ - boolean_t complete = B_FALSE; - - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_cancel_sync, dp, &complete, 3)); -} - -int -dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags) -{ - /* - * This function will be used by bp-rewrite wad to intercept frees. - */ - return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, - done, private, arc_flags)); -} - -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb1->zb_object != -1ULL); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == -1ULL) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == 0) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == 0) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - -static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) -{ - int elapsed_ticks; - int mintime; - - if (dp->dp_scrub_pausing) - return (B_TRUE); /* we're already pausing */ - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) - return (B_FALSE); /* we're resuming */ - - /* We only know how to resume from level-0 blocks. */ - if (zb->zb_level != 0) - return (B_FALSE); - - mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time : - zfs_scrub_min_time; - elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; - if (elapsed_ticks > hz * zfs_txg_timeout || - (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { - dprintf("pausing at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); - dp->dp_scrub_pausing = B_TRUE; - dp->dp_scrub_bookmark = *zb; - return (B_TRUE); - } - return (B_FALSE); -} - -typedef struct zil_traverse_arg { - dsl_pool_t *zta_dp; - zil_header_t *zta_zh; -} zil_traverse_arg_t; - -/* ARGSUSED */ -static void -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; - - /* - * One block ("stubby") can be allocated a long time ago; we - * want to visit that one because it has been allocated - * (on-disk) even if it hasn't been claimed (even though for - * plain scrub there's nothing to do to it). - */ - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) - return; - - zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); -} - -/* ARGSUSED */ -static void -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; - - /* - * birth can be < claim_txg if this record's txg is - * already txg sync'ed (but this log block contains - * other records that are not synced) - */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return; - - zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = lr->lr_foid; - zb.zb_level = BP_GET_LEVEL(bp); - zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - } -} - -static void -traverse_zil(dsl_pool_t *dp, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - zil_traverse_arg_t zta = { dp, zh }; - zilog_t *zilog; - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). - */ - if (claim_txg == 0 && spa_writeable(dp->dp_spa)) - return; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, - claim_txg); - - zil_free(zilog); -} - -static void -scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, - arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) -{ - int err; - arc_buf_t *buf = NULL; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; - - if (scrub_pause(dp, zb)) - return; - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { - /* - * If we already visited this bp & everything below (in - * a prior txg), don't bother doing it again. - */ - if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) - return; - - /* - * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for pausing - * again. 
- */ - if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > dp->dp_scrub_bookmark.zb_object) { - dprintf("resuming at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); - } - } - - if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - cbp = buf->b_data; - - for (i = 0; i < epb; i++, cbp++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - scrub_visitbp(dp, dnp, buf, cbp, &czb); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; - dnode_phys_t *child_dnp; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - child_dnp = buf->b_data; - - for (i = 0; i < epb; i++, child_dnp++) { - scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset, - zb->zb_blkid * epb + i); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; - objset_phys_t *osp; - - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - - osp = buf->b_data; - - traverse_zil(dp, &osp->os_zil_header); - - scrub_visitdnode(dp, &osp->os_meta_dnode, - buf, zb->zb_objset, 0); - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - scrub_visitdnode(dp, &osp->os_userused_dnode, - buf, zb->zb_objset, 0); - scrub_visitdnode(dp, &osp->os_groupused_dnode, - buf, zb->zb_objset, 0); - } - } - - (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); - if (buf) - (void) arc_buf_remove_ref(buf, &buf); -} - -static void -scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object) -{ - int j; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); - } - -} - -static void -scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) -{ - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds ? 
ds->ds_object : 0, 0, -1, 0); - scrub_visitbp(dp, NULL, NULL, bp, &zb); -} - -void -dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) != 0) { - return; - } - - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -} - -void -dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - dp->dp_scrub_bookmark.zb_objset = - ds->ds_phys->ds_prev_snap_obj; - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_prev_snap_obj, tx) == 0); - } -} - -void -dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; - } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; - } - - if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds1->ds_object, tx) == 0) { - int err = zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds2->ds_object, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there to begin with */ - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds2->ds_object, tx) == 0) { - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } -} - -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - -/* ARGSUSED */ -static int -enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - struct enqueue_clones_arg *eca = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - dp = ds->ds_dir->dd_pool; - - if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, eca->tx) == 0); - } - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - uint64_t min_txg_save; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - /* - * Iterate over the bps in this ds. 
- */ - min_txg_save = dp->dp_scrub_min_txg; - dp->dp_scrub_min_txg = - MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); - scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); - dp->dp_scrub_min_txg = min_txg_save; - - if (dp->dp_scrub_pausing) - goto out; - - /* - * Add descendent datasets to work queue. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - if (ds->ds_phys->ds_num_children > 1) { - boolean_t usenext = B_FALSE; - if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; - /* - * A bug in a previous version of the code could - * cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a - * missing entry. Therefore we can only use the - * next_clones_obj when its count is correct. - */ - int err = zap_count(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, &count); - if (err == 0 && - count == ds->ds_phys->ds_num_children - 1) - usenext = B_TRUE; - } - - if (usenext) { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); - } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - - (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, - NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } - } - -out: - dsl_dataset_rele(ds, FTAG); -} - -/* ARGSUSED */ -static int -enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - dmu_tx_t *tx = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - - dp = ds->ds_dir->dd_pool; - - while (ds->ds_phys->ds_prev_snap_obj != 0) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - /* - * If this is a clone, we don't need to worry about it for now. - */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(prev, FTAG); - return (0); - } - dsl_dataset_rele(ds, FTAG); - ds = prev; - } - - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) -{ - spa_t *spa = dp->dp_spa; - zap_cursor_t zc; - zap_attribute_t za; - boolean_t complete = B_TRUE; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - /* - * If the pool is not loaded, or is trying to unload, leave it alone. - */ - if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa)) - return; - - if (dp->dp_scrub_restart) { - enum scrub_func func = dp->dp_scrub_func; - dp->dp_scrub_restart = B_FALSE; - dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); - } - - if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { - /* - * We must have resumed after rebooting; reset the vdev - * stats to know that we're doing a scrub (although it - * will think we're just starting now). - */ - vdev_scrub_stat_update(spa->spa_root_vdev, - dp->dp_scrub_min_txg ? 
POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - } - - dp->dp_scrub_pausing = B_FALSE; - dp->dp_scrub_start_time = lbolt64; - dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - spa->spa_scrub_active = B_TRUE; - - if (dp->dp_scrub_bookmark.zb_objset == 0) { - /* First do the MOS & ORIGIN */ - scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); - if (dp->dp_scrub_pausing) - goto out; - - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(spa, - NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); - } else { - scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); - } - ASSERT(!dp->dp_scrub_pausing); - } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { - /* - * If we were paused, continue from here. Note if the - * ds we were paused on was deleted, the zb_objset will - * be -1, so we will skip this and find a new objset - * below. - */ - scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); - if (dp->dp_scrub_pausing) - goto out; - } - - /* - * In case we were paused right at the end of the ds, zero the - * bookmark so we don't think that we're still trying to resume. - */ - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), - zap_cursor_retrieve(&zc, &za) == 0) { - VERIFY(0 == zap_remove(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, za.za_name, tx)); - scrub_visitds(dp, za.za_first_integer, tx); - if (dp->dp_scrub_pausing) - break; - zap_cursor_fini(&zc); - } - zap_cursor_fini(&zc); - if (dp->dp_scrub_pausing) - goto out; - - /* done. */ - - dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); - return; -out: - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors, tx)); - - /* XXX this is scrub-clean specific */ - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); -} - -void -dsl_pool_scrub_restart(dsl_pool_t *dp) -{ - mutex_enter(&dp->dp_scrub_cancel_lock); - dp->dp_scrub_restart = B_TRUE; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -/* - * scrub consumers - */ - -static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) -{ - int i; - - /* - * If we resume after a reboot, zab will be NULL; don't record - * incomplete stats in that case. - */ - if (zab == NULL) - return; - - for (i = 0; i < 4; i++) { - int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; - int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; - int equal; - - zb->zb_count++; - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) - zb->zb_ditto_2_of_2_samevdev++; - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal == 1) - zb->zb_ditto_2_of_3_samevdev++; - else if (equal == 3) - zb->zb_ditto_3_of_3_samevdev++; - break; - } - } -} - -static void -dsl_pool_scrub_clean_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); - - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); -} - -static int -dsl_pool_scrub_clean_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_t *zb) -{ - size_t size = BP_GET_PSIZE(bp); - spa_t *spa = dp->dp_spa; - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int zio_priority; - - ASSERT(bp->blk_birth > dp->dp_scrub_min_txg); - - if (bp->blk_birth >= dp->dp_scrub_max_txg) - return (0); - - count_block(dp->dp_blkstats, bp); - - if (dp->dp_scrub_isresilver == 0) { - /* It's a scrub */ - zio_flags |= ZIO_FLAG_SCRUB; - zio_priority = ZIO_PRIORITY_SCRUB; - needs_io = B_TRUE; - } else { - /* It's a resilver */ - zio_flags |= ZIO_FLAG_RESILVER; - zio_priority = ZIO_PRIORITY_RESILVER; - needs_io = B_FALSE; - } - - /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += - DVA_GET_ASIZE(&bp->blk_dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - bp->blk_birth, 1); - } - } - } - - if (needs_io && !zfs_no_scrub_io) { - void *data = zio_data_buf_alloc(size); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_pool_scrub_clean_done, NULL, zio_priority, - zio_flags, zb)); - } - - /* do not relocate this block */ - return (0); -} - -int -dsl_pool_scrub_clean(dsl_pool_t *dp) -{ - spa_t *spa = dp->dp_spa; - - /* - * Purge all vdev caches. 
We do this here rather than in sync - * context because this requires a writer lock on the spa_config - * lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. - */ - spa_vdev_state_enter(spa); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); - - return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c index 2110022..b0818ce 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c @@ -19,18 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include -#include +#include #define DST_AVG_BLKSHIFT 14 @@ -50,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp) list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), offsetof(dsl_sync_task_t, dst_node)); dstg->dstg_pool = dp; - dstg->dstg_cr = CRED(); return (dstg); } @@ -112,14 +108,21 @@ top: return (dstg->dstg_err); } - VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); + /* + * We don't generally have many sync tasks, so pay the price of + * add_tail to get the tasks executed in the right order. + */ + VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, + dstg, txg)); dmu_tx_commit(tx); txg_wait_synced(dstg->dstg_pool, txg); - if (dstg->dstg_err == EAGAIN) + if (dstg->dstg_err == EAGAIN) { + txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); goto top; + } return (dstg->dstg_err); } @@ -131,7 +134,12 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) dstg->dstg_nowaiter = B_TRUE; txg = dmu_tx_get_txg(tx); - VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); + /* + * We don't generally have many sync tasks, so pay the price of + * add_tail to get the tasks executed in the right order. + */ + VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, + dstg, txg)); } void @@ -150,25 +158,30 @@ void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) { dsl_sync_task_t *dst; - void *tr_cookie; + dsl_pool_t *dp = dstg->dstg_pool; + uint64_t quota, used; ASSERT3U(dstg->dstg_err, ==, 0); /* - * Check for sufficient space. + * Check for sufficient space. We just check against what's + * on-disk; we don't want any in-flight accounting to get in our + * way, because open context may have already used up various + * in-core limits (arc_tempreserve, dsl_pool_tempreserve). */ - dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); - /* don't bother trying again */ - if (dstg->dstg_err == ERESTART) - dstg->dstg_err = EAGAIN; - if (dstg->dstg_err) + quota = dsl_pool_adjustedsize(dp, B_FALSE) - + metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + used = dp->dp_root_dir->dd_phys->dd_used_bytes; + /* MOS space is triple-dittoed, so we multiply by 3. */ + if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) { + dstg->dstg_err = ENOSPC; return; + } /* * Check for errors by calling checkfuncs. 
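A minimal standalone sketch of the new sync-task space check above (illustrative only, not part of the patch; the pool figures are hypothetical): because MOS data is triple-dittoed, a task that expects to modify "space" bytes is refused unless used + 3 * space still fits under the deferred-adjusted quota.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical stand-ins, in bytes, for dsl_pool_adjustedsize() minus
 * deferred space and for dp_root_dir->dd_phys->dd_used_bytes.
 */
#define	POOL_QUOTA	(96ULL << 30)
#define	POOL_USED	(95ULL << 30)

static int
synctask_space_check(uint64_t space)
{
	/* MOS space is triple-dittoed, so we multiply by 3. */
	if (space > 0 && POOL_USED + space * 3 > POOL_QUOTA)
		return (ENOSPC);
	return (0);
}

int
main(void)
{
	/* 512 MB of changes needs 1.5 GB of headroom; only 1 GB is left. */
	printf("%d\n", synctask_space_check(512ULL << 20));	/* ENOSPC */
	printf("%d\n", synctask_space_check(128ULL << 20));	/* 0 */
	return (0);
}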
*/ - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER); + rw_enter(&dp->dp_config_rwlock, RW_WRITER); for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { dst->dst_err = @@ -183,13 +196,10 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) */ for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, - dstg->dstg_cr, tx); + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); } } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); - - dsl_dir_tempreserve_clear(tr_cookie, tx); + rw_exit(&dp->dp_config_rwlock); if (dstg->dstg_nowaiter) dsl_sync_task_group_destroy(dstg); @@ -203,6 +213,8 @@ dsl_sync_task_do(dsl_pool_t *dp, dsl_sync_task_group_t *dstg; int err; + ASSERT(spa_writeable(dp->dp_spa)); + dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); @@ -218,6 +230,9 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp, { dsl_sync_task_group_t *dstg; + if (!spa_writeable(dp->dp_spa)) + return; + dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c deleted file mode 100644 index 54247d7..0000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Fletcher Checksums - * ------------------ - * - * ZFS's 2nd and 4th order Fletcher checksums are defined by the following - * recurrence relations: - * - * a = a + f - * i i-1 i-1 - * - * b = b + a - * i i-1 i - * - * c = c + b (fletcher-4 only) - * i i-1 i - * - * d = d + c (fletcher-4 only) - * i i-1 i - * - * Where - * a_0 = b_0 = c_0 = d_0 = 0 - * and - * f_0 .. f_(n-1) are the input data. - * - * Using standard techniques, these translate into the following series: - * - * __n_ __n_ - * \ | \ | - * a = > f b = > i * f - * n /___| n - i n /___| n - i - * i = 1 i = 1 - * - * - * __n_ __n_ - * \ | i*(i+1) \ | i*(i+1)*(i+2) - * c = > ------- f d = > ------------- f - * n /___| 2 n - i n /___| 6 n - i - * i = 1 i = 1 - * - * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. - * Since the additions are done mod (2^64), errors in the high bits may not - * be noticed. For this reason, fletcher-2 is deprecated. - * - * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. 
- * A conservative estimate of how big the buffer can get before we overflow - * can be estimated using f_i = 0xffffffff for all i: - * - * % bc - * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 - * 2264 - * quit - * % - * - * So blocks of up to 2k will not overflow. Our largest block size is - * 128k, which has 32k 4-byte words, so we can compute the largest possible - * accumulators, then divide by 2^64 to figure the max amount of overflow: - * - * % bc - * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } - * a/2^64;b/2^64;c/2^64;d/2^64 - * 0 - * 0 - * 1365 - * 11186858 - * quit - * % - * - * So a and b cannot overflow. To make sure each bit of input has some - * effect on the contents of c and d, we can look at what the factors of - * the coefficients in the equations for c_n and d_n are. The number of 2s - * in the factors determines the lowest set bit in the multiplier. Running - * through the cases for n*(n+1)/2 reveals that the highest power of 2 is - * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow - * the 64-bit accumulators, every bit of every f_i effects every accumulator, - * even for 128k blocks. - * - * If we wanted to make a stronger version of fletcher4 (fletcher4c?), - * we could do our calculations mod (2^32 - 1) by adding in the carries - * periodically, and store the number of carries in the top 32-bits. - * - * -------------------- - * Checksum Performance - * -------------------- - * - * There are two interesting components to checksum performance: cached and - * uncached performance. With cached data, fletcher-2 is about four times - * faster than fletcher-4. With uncached data, the performance difference is - * negligible, since the cost of a cache fill dominates the processing time. - * Even though fletcher-4 is slower than fletcher-2, it is still a pretty - * efficient pass over the data. - * - * In normal operation, the data which is being checksummed is in a buffer - * which has been filled either by: - * - * 1. a compression step, which will be mostly cached, or - * 2. a bcopy() or copyin(), which will be uncached (because the - * copy is cache-bypassing). - * - * For both cached and uncached data, both fletcher checksums are much faster - * than sha-256, and slower than 'off', which doesn't touch the data at all. 
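An illustrative standalone sketch (not part of the patch) of the property the fletcher_4_incremental_* functions below depend on: since the (a, b, c, d) accumulators are simply carried forward, checksumming a buffer in two chunks yields the same result as a single pass over the whole buffer.

#include <stdint.h>
#include <stdio.h>

static void
fletcher4(const uint32_t *ip, size_t words, uint64_t acc[4])
{
	for (size_t i = 0; i < words; i++) {
		acc[0] += ip[i];
		acc[1] += acc[0];
		acc[2] += acc[1];
		acc[3] += acc[2];
	}
}

int
main(void)
{
	uint32_t buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint64_t one[4] = { 0 }, two[4] = { 0 };

	fletcher4(buf, 8, one);		/* whole buffer in one pass */
	fletcher4(buf, 4, two);		/* first half ... */
	fletcher4(buf + 4, 4, two);	/* ... resume with second half */

	printf("%d\n", one[0] == two[0] && one[1] == two[1] &&
	    one[2] == two[2] && one[3] == two[3]);	/* prints 1 */
	return (0);
}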
- */ - -#include -#include -#include -#include - -void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { - a0 += ip[0]; - a1 += ip[1]; - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); -} - -void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { - a0 += BSWAP_64(ip[0]); - a1 += BSWAP_64(ip[1]); - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); -} - -void -fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_incremental_native(const void *buf, uint64_t size, - zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c index a88b85c..a2d9dab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c @@ -20,21 +20,20 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * We keep our own copy of this algorithm for 2 main reasons: - * 1. If we didn't, anyone modifying common/os/compress.c would + * We keep our own copy of this algorithm for 3 main reasons: + * 1. If we didn't, anyone modifying common/os/compress.c would * directly break our on disk format - * 2. Our version of lzjb does not have a number of checks that the + * 2. Our version of lzjb does not have a number of checks that the * common/os version needs and uses + * 3. We initialize the lempel to ensure deterministic results, + * so that identical blocks can always be deduplicated. 
* In particular, we are adding the "feature" that compress() can - * take a destination buffer size and return -1 if the data will not - * compress to d_len or less. + * take a destination buffer size and returns the compressed length, or the + * source length if compression would overflow the destination buffer. */ #include @@ -44,7 +43,7 @@ #define MATCH_MIN 3 #define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) -#define LEMPEL_SIZE 256 +#define LEMPEL_SIZE 1024 /*ARGSUSED*/ size_t @@ -54,20 +53,14 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) uchar_t *dst = d_start; uchar_t *cpy, *copymap; int copymask = 1 << (NBBY - 1); - int mlen, offset; + int mlen, offset, hash; uint16_t *hp; - uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */ + uint16_t lempel[LEMPEL_SIZE] = { 0 }; while (src < (uchar_t *)s_start + s_len) { if ((copymask <<= 1) == (1 << NBBY)) { - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { - if (d_len != s_len) - return (s_len); - mlen = s_len; - for (src = s_start, dst = d_start; mlen; mlen--) - *dst++ = *src++; + if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) return (s_len); - } copymask = 1; copymap = dst; *dst++ = 0; @@ -76,8 +69,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) *dst++ = *src++; continue; } - hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) & - (LEMPEL_SIZE - 1)]; + hash = (src[0] << 16) + (src[1] << 8) + src[2]; + hash += hash >> 9; + hash += hash >> 5; + hp = &lempel[hash & (LEMPEL_SIZE - 1)]; offset = (intptr_t)(src - *hp) & OFFSET_MASK; *hp = (uint16_t)(uintptr_t)src; cpy = src - offset; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index c5ce27c..17b4b12 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -23,7 +23,6 @@ */ #include -#include #include #include #include @@ -35,6 +34,11 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * Metaslab debugging: when set, keeps all space maps in core to verify frees. + */ +static int metaslab_debug = 0; + +/* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. 
Once the space map cannot satisfy * an allocation of this size then it switches to using more @@ -72,12 +76,13 @@ int metaslab_smo_bonus_pct = 150; * ========================================================================== */ metaslab_class_t * -metaslab_class_create(space_map_ops_t *ops) +metaslab_class_create(spa_t *spa, space_map_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; @@ -87,58 +92,73 @@ metaslab_class_create(space_map_ops_t *ops) void metaslab_class_destroy(metaslab_class_t *mc) { - metaslab_group_t *mg; - - while ((mg = mc->mc_rotor) != NULL) { - metaslab_class_remove(mc, mg); - metaslab_group_destroy(mg); - } + ASSERT(mc->mc_rotor == NULL); + ASSERT(mc->mc_alloc == 0); + ASSERT(mc->mc_deferred == 0); + ASSERT(mc->mc_space == 0); + ASSERT(mc->mc_dspace == 0); kmem_free(mc, sizeof (metaslab_class_t)); } -void -metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +int +metaslab_class_validate(metaslab_class_t *mc) { - metaslab_group_t *mgprev, *mgnext; + metaslab_group_t *mg; + vdev_t *vd; - ASSERT(mg->mg_class == NULL); + /* + * Must hold one of the spa_config locks. + */ + ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || + spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - mg->mg_class = mc; + if ((mg = mc->mc_rotor) == NULL) + return (0); + + do { + vd = mg->mg_vd; + ASSERT(vd->vdev_mg != NULL); + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + } while ((mg = mg->mg_next) != mc->mc_rotor); + + return (0); } void -metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { - metaslab_group_t *mgprev, *mgnext; + atomic_add_64(&mc->mc_alloc, alloc_delta); + atomic_add_64(&mc->mc_deferred, defer_delta); + atomic_add_64(&mc->mc_space, space_delta); + atomic_add_64(&mc->mc_dspace, dspace_delta); +} - ASSERT(mg->mg_class == mc); +uint64_t +metaslab_class_get_alloc(metaslab_class_t *mc) +{ + return (mc->mc_alloc); +} - mgprev = mg->mg_prev; - mgnext = mg->mg_next; +uint64_t +metaslab_class_get_deferred(metaslab_class_t *mc) +{ + return (mc->mc_deferred); +} - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } +uint64_t +metaslab_class_get_space(metaslab_class_t *mc) +{ + return (mc->mc_space); +} - mg->mg_prev = NULL; - mg->mg_next = NULL; - mg->mg_class = NULL; +uint64_t +metaslab_class_get_dspace(metaslab_class_t *mc) +{ + return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); } /* @@ -179,9 +199,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); mg->mg_vd = vd; - metaslab_class_add(mc, mg); + mg->mg_class = mc; + mg->mg_activation_count = 0; return (mg); } @@ -189,11 +209,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) void metaslab_group_destroy(metaslab_group_t *mg) { + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. + */ + ASSERT(mg->mg_activation_count <= 0); + avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); kmem_free(mg, sizeof (metaslab_group_t)); } +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; +} + +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; +} + static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { @@ -611,6 +702,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_group_add(mg, msp); + if (metaslab_debug && smo->smo_object != 0) { + mutex_enter(&msp->ms_lock); + VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, + SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); + mutex_exit(&msp->ms_lock); + } + /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. @@ -621,16 +719,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_sync_done(msp, 0); if (txg != 0) { - /* - * The vdev is dirty, but the metaslab isn't -- it just needs - * to have metaslab_sync_done() invoked from vdev_sync_done(). - * [We could just dirty the metaslab, but that would cause us - * to allocate a space map object for it, which is wasteful - * and would mess up the locality logic in metaslab_weight().] 
- */ - ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + vdev_dirty(vd, VDD_METASLAB, msp, txg); } return (msp); @@ -640,10 +730,9 @@ void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - int t; - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc, B_TRUE); + vdev_space_update(mg->mg_vd, + -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); metaslab_group_remove(mg, msp); @@ -652,11 +741,16 @@ metaslab_fini(metaslab_t *msp) space_map_unload(&msp->ms_map); space_map_destroy(&msp->ms_map); - for (t = 0; t < TXG_SIZE; t++) { + for (int t = 0; t < TXG_SIZE; t++) { space_map_destroy(&msp->ms_allocmap[t]); space_map_destroy(&msp->ms_freemap[t]); } + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_destroy(&msp->ms_defermap[t]); + + ASSERT3S(msp->ms_deferspace, ==, 0); + mutex_exit(&msp->ms_lock); mutex_destroy(&msp->ms_lock); @@ -741,7 +835,7 @@ metaslab_prefetch(metaslab_group_t *mg) if (!sm->sm_loaded && smo->smo_object != 0) { mutex_exit(&mg->mg_lock); - dmu_prefetch(spa->spa_meta_objset, smo->smo_object, + dmu_prefetch(spa_meta_objset(spa), smo->smo_object, 0ULL, smo->smo_objsize); mutex_enter(&mg->mg_lock); } @@ -759,11 +853,19 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); + space_map_load_wait(sm); + if (!sm->sm_loaded) { + int error = space_map_load(sm, sm_ops, SM_FREE, + &msp->ms_smo, + spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_claim, sm); + } /* @@ -812,7 +914,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; + objset_t *mos = spa_meta_objset(spa); space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; @@ -820,9 +922,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; - int t; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + ASSERT(!vd->vdev_ishole); + + if (allocmap->sm_space == 0 && freemap->sm_space == 0) + return; /* * The only state that can actually be changing concurrently with @@ -832,12 +936,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * We drop it whenever we call into the DMU, because the DMU * can call down to us (e.g. via zio_free()) at any time. 
*/ - mutex_enter(&msp->ms_lock); + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (smo->smo_object == 0) { ASSERT(smo->smo_objsize == 0); ASSERT(smo->smo_alloc == 0); - mutex_exit(&msp->ms_lock); smo->smo_object = dmu_object_alloc(mos, DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); @@ -845,9 +949,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), sizeof (uint64_t), &smo->smo_object, tx); - mutex_enter(&msp->ms_lock); } + mutex_enter(&msp->ms_lock); + space_map_walk(freemap, space_map_add, freed_map); if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= @@ -860,6 +965,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * This metaslab is 100% allocated, * minus the content of the in-core map (sm), * minus what's been freed this txg (freed_map), + * minus deferred frees (ms_defermap[]), * minus allocations from txgs in the future * (because they haven't been committed yet). */ @@ -871,7 +977,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_walk(sm, space_map_remove, allocmap); space_map_walk(freed_map, space_map_remove, allocmap); - for (t = 1; t < TXG_CONCURRENT_STATES; t++) + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_remove, allocmap); + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], space_map_remove, allocmap); @@ -905,9 +1015,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = &msp->ms_map; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; - int t; + int64_t alloc_delta, defer_delta; + + ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); @@ -916,16 +1029,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * allocmaps and freemaps and add its capacity to the vdev. */ if (freed_map->sm_size == 0) { - for (t = 0; t < TXG_SIZE; t++) { + for (int t = 0; t < TXG_SIZE; t++) { space_map_create(&msp->ms_allocmap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_create(&msp->ms_defermap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + + vdev_space_update(vd, 0, 0, sm->sm_size); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + alloc_delta = smosync->smo_alloc - smo->smo_alloc; + defer_delta = freed_map->sm_space - defer_map->sm_space; + + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -933,13 +1054,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If there's a space_map_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add everything we freed in this txg to the map. + * Then, add defer_map (oldest deferred frees) to this map and + * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(defer_map, sm->sm_loaded ? 
space_map_free : NULL, sm); + space_map_vacate(freed_map, space_map_add, defer_map); *smo = *smosync; + msp->ms_deferspace += defer_delta; + ASSERT3S(msp->ms_deferspace, >=, 0); + ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); + if (msp->ms_deferspace != 0) { + /* + * Keep syncing this metaslab until all deferred frees + * are back in circulation. + */ + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + } + /* * If the map is loaded but no longer active, evict it as soon as all * future allocations have synced. (If we unloaded it now and then @@ -948,11 +1082,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int evictable = 1; - for (t = 1; t < TXG_CONCURRENT_STATES; t++) + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) evictable = 0; - if (evictable) + if (evictable && !metaslab_debug) space_map_unload(sm); } @@ -1119,12 +1253,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_allocated because + * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -1146,10 +1280,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (flags & METASLAB_HINTBP_AVOID) - mg = vd->vdev_mg->mg_next; - else + + /* + * It's possible the vdev we're using as the hint no + * longer exists (i.e. removed). Consult the rotor when + * all else fails. + */ + if (vd != NULL) { mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; @@ -1158,15 +1303,18 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } /* - * If the hint put us into the wrong class, just follow the rotor. + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. */ - if (mg->mg_class != mc) + if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: all_zero = B_TRUE; do { + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; /* @@ -1211,32 +1359,28 @@ top: * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ - if (mc->mc_allocated == 0) { + if (mc->mc_aliquot == 0) { vdev_stat_t *vs = &vd->vdev_stat; - uint64_t alloc, space; - int64_t vu, su; - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); + int64_t vu, cu; /* * Determine percent used in units of 0..1024. * (This is just to avoid floating point.) */ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - su = (alloc << 10) / (space + 1); + cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); /* * Bias by at most +/- 25% of the aliquot. 
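A worked example of the 25% cap stated above and computed in the mg_bias expression immediately below (illustrative only, not part of the patch; the utilization figures are made up): with usage expressed in units of 0..1024, a vdev that is emptier than its class gets a proportionally larger share, and since |cu - vu| can never exceed 1024 the adjustment is bounded by one quarter of the aliquot.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t aliquot = 512 << 10;	/* metaslab_aliquot: 512 KB */
	int64_t vu = 256;		/* this vdev is 25% full */
	int64_t cu = 512;		/* the class as a whole is 50% full */

	/* Same expression as mg_bias in the hunk below. */
	int64_t bias = ((cu - vu) * aliquot) / (1024 * 4);

	printf("%lld\n", (long long)bias);	/* 32768: +6.25% of aliquot */

	/* Worst case, |cu - vu| == 1024: exactly aliquot / 4, i.e. 25%. */
	printf("%lld\n", (long long)((1024 * aliquot) / (1024 * 4)));
	return (0);
}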
*/ - mg->mg_bias = ((su - vu) * + mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / (1024 * 4); } - if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -1248,7 +1392,7 @@ top: } next: mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); if (!all_zero) { @@ -1328,7 +1472,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; metaslab_t *msp; - int error; + int error = 0; ASSERT(DVA_IS_VALID(dva)); @@ -1343,7 +1487,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + + if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) + error = ENOENT; + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); @@ -1371,6 +1520,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int error = 0; ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -1400,7 +1550,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_exit(spa, SCL_ALLOC, FTAG); - bp->blk_birth = txg; + BP_SET_BIRTH(bp, txg, txg); return (0); } @@ -1412,7 +1562,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c index 5fe4e63..6d8e2f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG #ifdef _KERNEL int reference_tracking_enable = FALSE; /* runs out of memory too easily */ @@ -192,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder) return (refcount_remove_many(rc, 1, holder)); } -#endif +void +refcount_transfer(refcount_t *dst, refcount_t *src) +{ + int64_t count, removed_count; + list_t list, removed; + + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + + mutex_enter(&src->rc_mtx); + count = src->rc_count; + removed_count = src->rc_removed_count; + src->rc_count = 0; + src->rc_removed_count = 0; + list_move_tail(&list, &src->rc_list); + list_move_tail(&removed, &src->rc_removed); + mutex_exit(&src->rc_mtx); + + mutex_enter(&dst->rc_mtx); + dst->rc_count += count; + dst->rc_removed_count += removed_count; + list_move_tail(&dst->rc_list, &list); + list_move_tail(&dst->rc_removed, &removed); + mutex_exit(&dst->rc_mtx); + + list_destroy(&list); + list_destroy(&removed); +} + +#endif /* ZFS_DEBUG */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c new file mode 100644 index 0000000..4db13fd --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -0,0 +1,1970 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS System attributes: + * + * A generic mechanism to allow for arbitrary attributes + * to be stored in a dnode. The data will be stored in the bonus buffer of + * the dnode and if necessary a special "spill" block will be used to handle + * overflow situations. The spill block will be sized to fit the data + * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the + * spill block is stored at the end of the current bonus buffer. Any + * attributes that would be in the way of the blkptr_t will be relocated + * into the spill block. + * + * Attribute registration: + * + * Stored persistently on a per dataset basis + * a mapping between attribute "string" names and their actual attribute + * numeric values, length, and byteswap function. The names are only used + * during registration. All attributes are known by their unique attribute + * id value. If an attribute can have a variable size then the value + * 0 will be used to indicate this. 
+ * + * Attribute Layout: + * + * Attribute layouts are a way to compactly store multiple attributes, but + * without taking the overhead associated with managing each attribute + * individually. Since you will typically have the same set of attributes + * stored in the same order a single table will be used to represent that + * layout. The ZPL for example will usually have only about 10 different + * layouts (regular files, device files, symlinks, + * regular files + scanstamp, files/dir with extended attributes, and then + * you have the possibility of all of those minus ACL, because it would + * be kicked out into the spill block) + * + * Layouts are simply an array of the attributes and their + * ordering i.e. [0, 1, 4, 5, 2] + * + * Each distinct layout is given a unique layout number and that is whats + * stored in the header at the beginning of the SA data buffer. + * + * A layout only covers a single dbuf (bonus or spill). If a set of + * attributes is split up between the bonus buffer and a spill buffer then + * two different layouts will be used. This allows us to byteswap the + * spill without looking at the bonus buffer and keeps the on disk format of + * the bonus and spill buffer the same. + * + * Adding a single attribute will cause the entire set of attributes to + * be rewritten and could result in a new layout number being constructed + * as part of the rewrite if no such layout exists for the new set of + * attribues. The new attribute will be appended to the end of the already + * existing attributes. + * + * Both the attribute registration and attribute layout information are + * stored in normal ZAP attributes. Their should be a small number of + * known layouts and the set of attributes is assumed to typically be quite + * small. + * + * The registered attributes and layout "table" information is maintained + * in core and a special "sa_os_t" is attached to the objset_t. + * + * A special interface is provided to allow for quickly applying + * a large set of attributes at once. sa_replace_all_by_template() is + * used to set an array of attributes. This is used by the ZPL when + * creating a brand new file. The template that is passed into the function + * specifies the attribute, size for variable length attributes, location of + * data and special "data locator" function if the data isn't in a contiguous + * location. + * + * Byteswap implications: + * Since the SA attributes are not entirely self describing we can't do + * the normal byteswap processing. The special ZAP layout attribute and + * attribute registration attributes define the byteswap function and the + * size of the attributes, unless it is variable sized. + * The normal ZFS byteswapping infrastructure assumes you don't need + * to read any objects in order to do the necessary byteswapping. Whereas + * SA attributes can only be properly byteswapped if the dataset is opened + * and the layout/attribute ZAP attributes are available. Because of this + * the SA attributes will be byteswapped when they are first accessed by + * the SA code that will read the SA data. 
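To make the layout description above concrete, a toy standalone sketch (not part of the patch, and not the real on-disk format): a layout is just an ordered list of attribute numbers, and finding an attribute means locating its slot in that list; the real code caches this mapping in an index table (sa_idx_tab_t) and additionally handles variable-size attributes, headers and spill blocks.

#include <stdint.h>
#include <stdio.h>

/* A toy layout: which attribute number occupies each slot of the buffer. */
static const int layout[] = { 0, 1, 4, 5, 2 };
#define	LAYOUT_COUNT	(sizeof (layout) / sizeof (layout[0]))

/* Look an attribute up by walking the layout; fixed 8-byte values only. */
static int
toy_sa_lookup(const uint64_t *buf, int attr, uint64_t *value)
{
	for (size_t i = 0; i < LAYOUT_COUNT; i++) {
		if (layout[i] == attr) {
			*value = buf[i];
			return (0);
		}
	}
	return (-1);	/* attribute not present in this layout */
}

int
main(void)
{
	uint64_t buf[LAYOUT_COUNT] = { 100, 101, 104, 105, 102 };
	uint64_t v;

	if (toy_sa_lookup(buf, 4, &v) == 0)
		printf("attr 4 = %llu\n", (unsigned long long)v);	/* 104 */
	if (toy_sa_lookup(buf, 3, &v) != 0)
		printf("attr 3 not in this layout\n");
	return (0);
}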
+ */ + +typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, + uint16_t length, int length_idx, boolean_t, void *userp); + +static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); +static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); +static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, + void *data); +static void sa_idx_tab_rele(objset_t *os, void *arg); +static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, + int buflen); +static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx); + +arc_byteswap_func_t *sa_bswap_table[] = { + byteswap_uint64_array, + byteswap_uint32_array, + byteswap_uint16_array, + byteswap_uint8_array, + zfs_acl_byteswap, +}; + +#define SA_COPY_DATA(f, s, t, l) \ + { \ + if (f == NULL) { \ + if (l == 8) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + } else if (l == 16) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + *(uint64_t *)((uintptr_t)t + 8) = \ + *(uint64_t *)((uintptr_t)s + 8); \ + } else { \ + bcopy(s, t, l); \ + } \ + } else \ + sa_copy_data(f, s, t, l); \ + } + +/* + * This table is fixed and cannot be changed. Its purpose is to + * allow the SA code to work with both old/new ZPL file systems. + * It contains the list of legacy attributes. These attributes aren't + * stored in the "attribute" registry zap objects, since older ZPL file systems + * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will + * use this static table. + */ +sa_attr_reg_t sa_legacy_attrs[] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, +}; + +/* + * ZPL legacy layout + * This is only used for objects of type DMU_OT_ZNODE + */ +sa_attr_type_t sa_legacy_zpl_layout[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * Special dummy layout used for buffers with no attributes. 
+ */ + +sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; + +static int sa_legacy_attr_count = 16; +static kmem_cache_t *sa_cache = NULL; + +/*ARGSUSED*/ +static int +sa_cache_constructor(void *buf, void *unused, int kmflag) +{ + sa_handle_t *hdl = buf; + + hdl->sa_bonus_tab = NULL; + hdl->sa_spill_tab = NULL; + hdl->sa_os = NULL; + hdl->sa_userp = NULL; + hdl->sa_bonus = NULL; + hdl->sa_spill = NULL; + mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED*/ +static void +sa_cache_destructor(void *buf, void *unused) +{ + sa_handle_t *hdl = buf; + mutex_destroy(&hdl->sa_lock); +} + +void +sa_cache_init(void) +{ + sa_cache = kmem_cache_create("sa_cache", + sizeof (sa_handle_t), 0, sa_cache_constructor, + sa_cache_destructor, NULL, NULL, NULL, 0); +} + +void +sa_cache_fini(void) +{ + if (sa_cache) + kmem_cache_destroy(sa_cache); +} + +static int +layout_num_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = arg1; + const sa_lot_t *node2 = arg2; + + if (node1->lot_num > node2->lot_num) + return (1); + else if (node1->lot_num < node2->lot_num) + return (-1); + return (0); +} + +static int +layout_hash_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = arg1; + const sa_lot_t *node2 = arg2; + + if (node1->lot_hash > node2->lot_hash) + return (1); + if (node1->lot_hash < node2->lot_hash) + return (-1); + if (node1->lot_instance > node2->lot_instance) + return (1); + if (node1->lot_instance < node2->lot_instance) + return (-1); + return (0); +} + +boolean_t +sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) +{ + int i; + + if (count != tbf->lot_attr_count) + return (1); + + for (i = 0; i != count; i++) { + if (attrs[i] != tbf->lot_attrs[i]) + return (1); + } + return (0); +} + +#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) + +static uint64_t +sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +{ + int i; + uint64_t crc = -1ULL; + + for (i = 0; i != attr_count; i++) + crc ^= SA_ATTR_HASH(attrs[i]); + + return (crc); +} + +static int +sa_get_spill(sa_handle_t *hdl) +{ + int rc; + if (hdl->sa_spill == NULL) { + if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, + &hdl->sa_spill)) == 0) + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } else { + rc = 0; + } + + return (rc); +} + +/* + * Main attribute lookup/update function + * returns 0 for success or non zero for failures + * + * Operates on bulk array, first failure will abort further processing + */ +int +sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + sa_data_op_t data_op, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + int i; + int error = 0; + sa_buf_type_t buftypes; + + buftypes = 0; + + ASSERT(count > 0); + for (i = 0; i != count; i++) { + ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); + + bulk[i].sa_addr = NULL; + /* First check the bonus buffer */ + + if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( + hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_bonus_tab, + SA_GET_HDR(hdl, SA_BONUS), + bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); + if (tx && !(buftypes & SA_BONUS)) { + dmu_buf_will_dirty(hdl->sa_bonus, tx); + buftypes |= SA_BONUS; + } + } + if (bulk[i].sa_addr == NULL && + ((error = sa_get_spill(hdl)) == 0)) { + if (TOC_ATTR_PRESENT( + hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_spill_tab, + SA_GET_HDR(hdl, SA_SPILL), + bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); + if (tx && !(buftypes & SA_SPILL) && + bulk[i].sa_size == 
bulk[i].sa_length) { + dmu_buf_will_dirty(hdl->sa_spill, tx); + buftypes |= SA_SPILL; + } + } + } + if (error && error != ENOENT) { + return ((error == ECKSUM) ? EIO : error); + } + + switch (data_op) { + case SA_LOOKUP: + if (bulk[i].sa_addr == NULL) + return (ENOENT); + if (bulk[i].sa_data) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_addr, bulk[i].sa_data, + bulk[i].sa_size); + } + continue; + + case SA_UPDATE: + /* existing rewrite of attr */ + if (bulk[i].sa_addr && + bulk[i].sa_size == bulk[i].sa_length) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_addr, + bulk[i].sa_length); + continue; + } else if (bulk[i].sa_addr) { /* attr size change */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_REPLACE, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } else { /* adding new attribute */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_ADD, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } + if (error) + return (error); + break; + } + } + return (error); +} + +static sa_lot_t * +sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, + uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, *findtb; + int i; + avl_index_t loc; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); + tb->lot_attr_count = attr_count; + tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + tb->lot_num = lot_num; + tb->lot_hash = hash; + tb->lot_instance = 0; + + if (zapadd) { + char attr_name[8]; + + if (sa->sa_layout_attr_obj == 0) { + sa->sa_layout_attr_obj = zap_create(os, + DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, + &sa->sa_layout_attr_obj, tx) == 0); + } + + (void) snprintf(attr_name, sizeof (attr_name), + "%d", (int)lot_num); + VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, + attr_name, 2, attr_count, attrs, tx)); + } + + list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), + offsetof(sa_idx_tab_t, sa_next)); + + for (i = 0; i != attr_count; i++) { + if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) + tb->lot_var_sizes++; + } + + avl_add(&sa->sa_layout_num_tree, tb); + + /* verify we don't have a hash collision */ + if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { + for (; findtb && findtb->lot_hash == hash; + findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { + if (findtb->lot_instance != tb->lot_instance) + break; + tb->lot_instance++; + } + } + avl_add(&sa->sa_layout_hash_tree, tb); + return (tb); +} + +static void +sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, + int count, dmu_tx_t *tx, sa_lot_t **lot) +{ + sa_lot_t *tb, tbsearch; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + boolean_t found = B_FALSE; + + mutex_enter(&sa->sa_lock); + tbsearch.lot_hash = hash; + tbsearch.lot_instance = 0; + tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); + if (tb) { + for (; tb && tb->lot_hash == hash; + tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { + if (sa_layout_equal(tb, attrs, count) == 0) { + found = B_TRUE; + break; + } + } + } + if (!found) { + tb = sa_add_layout_entry(os, attrs, count, + avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); + } + mutex_exit(&sa->sa_lock); + *lot = tb; +} + +static int +sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) +{ + int error; + uint32_t 
blocksize; + + if (size == 0) { + blocksize = SPA_MINBLOCKSIZE; + } else if (size > SPA_MAXBLOCKSIZE) { + ASSERT(0); + return (EFBIG); + } else { + blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); + } + + error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); + ASSERT(error == 0); + return (error); +} + +static void +sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) +{ + if (func == NULL) { + bcopy(datastart, target, buflen); + } else { + boolean_t start; + int bytes; + void *dataptr; + void *saptr = target; + uint32_t length; + + start = B_TRUE; + bytes = 0; + while (bytes < buflen) { + func(&dataptr, &length, buflen, start, datastart); + bcopy(dataptr, saptr, length); + saptr = (void *)((caddr_t)saptr + length); + bytes += length; + start = B_FALSE; + } + } +} + +/* + * Determine several different sizes + * first the sa header size + * the number of bytes to be stored + * if spill would occur the index in the attribute array is returned + * + * the boolean will_spill will be set when spilling is necessary. It + * is only set when the buftype is SA_BONUS + */ +static int +sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, + boolean_t *will_spill) +{ + int var_size = 0; + int i; + int full_space; + int hdrsize; + boolean_t done = B_FALSE; + + if (buftype == SA_BONUS && sa->sa_force_spill) { + *total = 0; + *index = 0; + *will_spill = B_TRUE; + return (0); + } + + *index = -1; + *total = 0; + + if (buftype == SA_BONUS) + *will_spill = B_FALSE; + + hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : + sizeof (sa_hdr_phys_t); + + full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size; + + for (i = 0; i != attr_count; i++) { + boolean_t is_var_sz; + + *total += attr_desc[i].sa_length; + if (done) + goto next; + + is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); + if (is_var_sz) { + var_size++; + } + + if (is_var_sz && var_size > 1) { + if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + + *total < full_space) { + hdrsize += sizeof (uint16_t); + } else { + done = B_TRUE; + *index = i; + if (buftype == SA_BONUS) + *will_spill = B_TRUE; + continue; + } + } + + /* + * find index of where spill *could* occur. + * Then continue to count of remainder attribute + * space. The sum is used later for sizing bonus + * and spill buffer. + */ + if (buftype == SA_BONUS && *index == -1 && + P2ROUNDUP(*total + hdrsize, 8) > + (full_space - sizeof (blkptr_t))) { + *index = i; + done = B_TRUE; + } + +next: + if (P2ROUNDUP(*total + hdrsize, 8) > full_space && + buftype == SA_BONUS) + *will_spill = B_TRUE; + } + + hdrsize = P2ROUNDUP(hdrsize, 8); + return (hdrsize); +} + +#define BUF_SPACE_NEEDED(total, header) (total + header) + +/* + * Find layout that corresponds to ordering of attributes + * If not found a new layout number is created and added to + * persistent layout tables. 
+ */ +static int +sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + uint64_t hash; + sa_buf_type_t buftype; + sa_hdr_phys_t *sahdr; + void *data_start; + int buf_space; + sa_attr_type_t *attrs, *attrs_start; + int i, lot_count; + int hdrsize, spillhdrsize; + int used; + dmu_object_type_t bonustype; + sa_lot_t *lot; + int len_idx; + int spill_used; + boolean_t spilling; + + dmu_buf_will_dirty(hdl->sa_bonus, tx); + bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + + /* first determine bonus header size and sum of all attributes */ + hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, + SA_BONUS, &i, &used, &spilling); + + if (used > SPA_MAXBLOCKSIZE) + return (EFBIG); + + VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? + MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : + used + hdrsize, tx)); + + ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || + bonustype == DMU_OT_SA); + + /* setup and size spill buffer when needed */ + if (spilling) { + boolean_t dummy; + + if (hdl->sa_spill == NULL) { + VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, + &hdl->sa_spill) == 0); + } + dmu_buf_will_dirty(hdl->sa_spill, tx); + + spillhdrsize = sa_find_sizes(sa, &attr_desc[i], + attr_count - i, hdl->sa_spill, SA_SPILL, &i, + &spill_used, &dummy); + + if (spill_used > SPA_MAXBLOCKSIZE) + return (EFBIG); + + buf_space = hdl->sa_spill->db_size - spillhdrsize; + if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > + hdl->sa_spill->db_size) + VERIFY(0 == sa_resize_spill(hdl, + BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); + } + + /* setup starting pointers to lay down data */ + data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); + sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + buftype = SA_BONUS; + + if (spilling) + buf_space = (sa->sa_force_spill) ? + 0 : SA_BLKPTR_SPACE - hdrsize; + else + buf_space = hdl->sa_bonus->db_size - hdrsize; + + attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + lot_count = 0; + + for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { + uint16_t length; + + attrs[i] = attr_desc[i].sa_attr; + length = SA_REGISTERED_LEN(sa, attrs[i]); + if (length == 0) + length = attr_desc[i].sa_length; + + if (buf_space < length) { /* switch to spill buffer */ + VERIFY(bonustype == DMU_OT_SA); + if (buftype == SA_BONUS && !sa->sa_force_spill) { + sa_find_layout(hdl->sa_os, hash, attrs_start, + lot_count, tx, &lot); + SA_SET_HDR(sahdr, lot->lot_num, hdrsize); + } + + buftype = SA_SPILL; + hash = -1ULL; + len_idx = 0; + + sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr->sa_magic = SA_MAGIC; + data_start = (void *)((uintptr_t)sahdr + + spillhdrsize); + attrs_start = &attrs[i]; + buf_space = hdl->sa_spill->db_size - spillhdrsize; + lot_count = 0; + } + hash ^= SA_ATTR_HASH(attrs[i]); + attr_desc[i].sa_addr = data_start; + attr_desc[i].sa_size = length; + SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, + data_start, length); + if (sa->sa_attr_table[attrs[i]].sa_length == 0) { + sahdr->sa_lengths[len_idx++] = length; + } + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + length), 8); + buf_space -= P2ROUNDUP(length, 8); + lot_count++; + } + + sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); + + /* + * Verify that old znodes always have layout number 0. 
+ * Must be DMU_OT_SA for arbitrary layouts + */ + VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || + (bonustype == DMU_OT_SA && lot->lot_num > 1)); + + if (bonustype == DMU_OT_SA) { + SA_SET_HDR(sahdr, lot->lot_num, + buftype == SA_BONUS ? hdrsize : spillhdrsize); + } + + kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (!sa->sa_force_spill) + VERIFY(0 == sa_build_index(hdl, SA_BONUS)); + if (hdl->sa_spill) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + if (!spilling) { + /* + * remove spill block that is no longer needed. + */ + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + hdl->sa_spill_tab = NULL; + VERIFY(0 == dmu_rm_spill(hdl->sa_os, + sa_handle_object(hdl), tx)); + } else { + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } + } + + return (0); +} + +static void +sa_free_attr_table(sa_os_t *sa) +{ + int i; + + if (sa->sa_attr_table == NULL) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_name) + kmem_free(sa->sa_attr_table[i].sa_name, + strlen(sa->sa_attr_table[i].sa_name) + 1); + } + + kmem_free(sa->sa_attr_table, + sizeof (sa_attr_table_t) * sa->sa_num_attrs); + + sa->sa_attr_table = NULL; +} + +static int +sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +{ + sa_os_t *sa = os->os_sa; + uint64_t sa_attr_count = 0; + uint64_t sa_reg_count; + int error = 0; + uint64_t attr_value; + sa_attr_table_t *tb; + zap_cursor_t zc; + zap_attribute_t za; + int registered_count = 0; + int i; + dmu_objset_type_t ostype = dmu_objset_type(os); + + sa->sa_user_table = + kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); + sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); + + if (sa->sa_reg_attr_obj != 0) { + error = zap_count(os, sa->sa_reg_attr_obj, + &sa_attr_count); + + /* + * Make sure we retrieved a count and that it isn't zero + */ + if (error || (error == 0 && sa_attr_count == 0)) { + if (error == 0) + error = EINVAL; + goto bail; + } + sa_reg_count = sa_attr_count; + } + + if (ostype == DMU_OST_ZFS && sa_attr_count == 0) + sa_attr_count += sa_legacy_attr_count; + + /* Allocate attribute numbers for attributes that aren't registered */ + for (i = 0; i != count; i++) { + boolean_t found = B_FALSE; + int j; + + if (ostype == DMU_OST_ZFS) { + for (j = 0; j != sa_legacy_attr_count; j++) { + if (strcmp(reg_attrs[i].sa_name, + sa_legacy_attrs[j].sa_name) == 0) { + sa->sa_user_table[i] = + sa_legacy_attrs[j].sa_attr; + found = B_TRUE; + } + } + } + if (found) + continue; + + if (sa->sa_reg_attr_obj) + error = zap_lookup(os, sa->sa_reg_attr_obj, + reg_attrs[i].sa_name, 8, 1, &attr_value); + else + error = ENOENT; + switch (error) { + case ENOENT: + sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; + sa_attr_count++; + break; + case 0: + sa->sa_user_table[i] = ATTR_NUM(attr_value); + break; + default: + goto bail; + } + } + + sa->sa_num_attrs = sa_attr_count; + tb = sa->sa_attr_table = + kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); + + /* + * Attribute table is constructed from requested attribute list, + * previously foreign registered attributes, and also the legacy + * ZPL set of attributes. 
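+ * The entries are filled in below in that order: the registry ZAP is + * walked first, then the legacy ZPL attributes are added for ZFS + * objsets, and finally any requested attributes not already present.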
+ */ + + if (sa->sa_reg_attr_obj) { + for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t value; + value = za.za_first_integer; + + registered_count++; + tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); + tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); + tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); + tb[ATTR_NUM(value)].sa_registered = B_TRUE; + + if (tb[ATTR_NUM(value)].sa_name) { + continue; + } + tb[ATTR_NUM(value)].sa_name = + kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); + (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, + strlen(za.za_name) +1); + } + zap_cursor_fini(&zc); + /* + * Make sure we processed the correct number of registered + * attributes + */ + if (registered_count != sa_reg_count) { + ASSERT(error != 0); + goto bail; + } + + } + + if (ostype == DMU_OST_ZFS) { + for (i = 0; i != sa_legacy_attr_count; i++) { + if (tb[i].sa_name) + continue; + tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; + tb[i].sa_length = sa_legacy_attrs[i].sa_length; + tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; + tb[i].sa_registered = B_FALSE; + tb[i].sa_name = + kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, + KM_SLEEP); + (void) strlcpy(tb[i].sa_name, + sa_legacy_attrs[i].sa_name, + strlen(sa_legacy_attrs[i].sa_name) + 1); + } + } + + for (i = 0; i != count; i++) { + sa_attr_type_t attr_id; + + attr_id = sa->sa_user_table[i]; + if (tb[attr_id].sa_name) + continue; + + tb[attr_id].sa_length = reg_attrs[i].sa_length; + tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; + tb[attr_id].sa_attr = attr_id; + tb[attr_id].sa_name = + kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); + (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, + strlen(reg_attrs[i].sa_name) + 1); + } + + sa->sa_need_attr_registration = + (sa_attr_count != registered_count); + + return (0); +bail: + kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); + sa->sa_user_table = NULL; + sa_free_attr_table(sa); + return ((error != 0) ? 
error : EINVAL); +} + +int +sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, + sa_attr_type_t **user_table) +{ + zap_cursor_t zc; + zap_attribute_t za; + sa_os_t *sa; + dmu_objset_type_t ostype = dmu_objset_type(os); + sa_attr_type_t *tb; + int error; + + mutex_enter(&os->os_lock); + if (os->os_sa) { + mutex_enter(&os->os_sa->sa_lock); + mutex_exit(&os->os_lock); + tb = os->os_sa->sa_user_table; + mutex_exit(&os->os_sa->sa_lock); + *user_table = tb; + return (0); + } + + sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); + mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); + sa->sa_master_obj = sa_obj; + + os->os_sa = sa; + mutex_enter(&sa->sa_lock); + mutex_exit(&os->os_lock); + avl_create(&sa->sa_layout_num_tree, layout_num_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); + avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); + + if (sa_obj) { + error = zap_lookup(os, sa_obj, SA_LAYOUTS, + 8, 1, &sa->sa_layout_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + error = zap_lookup(os, sa_obj, SA_REGISTRY, + 8, 1, &sa->sa_reg_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + } + + if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) + goto fail; + + if (sa->sa_layout_attr_obj != 0) { + uint64_t layout_count; + + error = zap_count(os, sa->sa_layout_attr_obj, + &layout_count); + + /* + * Layout number count should be > 0 + */ + if (error || (error == 0 && layout_count == 0)) { + if (error == 0) + error = EINVAL; + goto fail; + } + + for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + sa_attr_type_t *lot_attrs; + uint64_t lot_num; + + lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * + za.za_num_integers, KM_SLEEP); + + if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, + za.za_name, 2, za.za_num_integers, + lot_attrs))) != 0) { + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + break; + } + VERIFY(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num) == 0); + + (void) sa_add_layout_entry(os, lot_attrs, + za.za_num_integers, lot_num, + sa_layout_info_hash(lot_attrs, + za.za_num_integers), B_FALSE, NULL); + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + } + zap_cursor_fini(&zc); + + /* + * Make sure layout count matches number of entries added + * to AVL tree + */ + if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { + ASSERT(error != 0); + goto fail; + } + } + + /* Add special layout number for old ZNODES */ + if (ostype == DMU_OST_ZFS) { + (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, + sa_legacy_attr_count, 0, + sa_layout_info_hash(sa_legacy_zpl_layout, + sa_legacy_attr_count), B_FALSE, NULL); + + (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, + 0, B_FALSE, NULL); + } + *user_table = os->os_sa->sa_user_table; + mutex_exit(&sa->sa_lock); + return (0); +fail: + os->os_sa = NULL; + sa_free_attr_table(sa); + if (sa->sa_user_table) + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + mutex_exit(&sa->sa_lock); + kmem_free(sa, sizeof (sa_os_t)); + return ((error == ECKSUM) ? 
EIO : error); +} + +void +sa_tear_down(objset_t *os) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *layout; + void *cookie; + + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + + /* Free up attr table */ + + sa_free_attr_table(sa); + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { + sa_idx_tab_t *tab; + while (tab = list_head(&layout->lot_idx_tab)) { + ASSERT(refcount_count(&tab->sa_refcount)); + sa_idx_tab_rele(os, tab); + } + } + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { + kmem_free(layout->lot_attrs, + sizeof (sa_attr_type_t) * layout->lot_attr_count); + kmem_free(layout, sizeof (sa_lot_t)); + } + + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + + kmem_free(sa, sizeof (sa_os_t)); + os->os_sa = NULL; +} + +void +sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t var_length, void *userp) +{ + sa_idx_tab_t *idx_tab = userp; + + if (var_length) { + ASSERT(idx_tab->sa_variable_lengths); + idx_tab->sa_variable_lengths[length_idx] = length; + } + TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, + (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); +} + +static void +sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, + sa_iterfunc_t func, sa_lot_t *tab, void *userp) +{ + void *data_start; + sa_lot_t *tb = tab; + sa_lot_t search; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + int i; + uint16_t *length_start = NULL; + uint8_t length_idx = 0; + + if (tab == NULL) { + search.lot_num = SA_LAYOUT_NUM(hdr, type); + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + ASSERT(tb); + } + + if (IS_SA_BONUSTYPE(type)) { + data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + + offsetof(sa_hdr_phys_t, sa_lengths) + + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); + length_start = hdr->sa_lengths; + } else { + data_start = hdr; + } + + for (i = 0; i != tb->lot_attr_count; i++) { + int attr_length, reg_length; + uint8_t idx_len; + + reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + if (reg_length) { + attr_length = reg_length; + idx_len = 0; + } else { + attr_length = length_start[length_idx]; + idx_len = length_idx++; + } + + func(hdr, data_start, tb->lot_attrs[i], attr_length, + idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp); + + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + attr_length), 8); + } +} + +/*ARGSUSED*/ +void +sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t variable_length, void *userp) +{ + sa_handle_t *hdl = userp; + sa_os_t *sa = hdl->sa_os->os_sa; + + sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); +} + +void +sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); + dmu_buf_impl_t *db; + sa_os_t *sa = hdl->sa_os->os_sa; + int num_lengths = 1; + int i; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + if (sa_hdr_phys->sa_magic == SA_MAGIC) + return; + + db = SA_GET_DB(hdl, buftype); + + if (buftype == SA_SPILL) { + arc_release(db->db_buf, NULL); + arc_buf_thaw(db->db_buf); + } + + sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); + sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); + + /* + * Determine number of variable lengths in header + * The standard 8 byte header has one for free and a + * 16 byte header would have 4 + 1; + */ + if (SA_HDR_SIZE(sa_hdr_phys) > 8) + num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; + for (i = 0; i != num_lengths; i++) + sa_hdr_phys->sa_lengths[i] = + BSWAP_16(sa_hdr_phys->sa_lengths[i]); + + sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, + sa_byteswap_cb, NULL, hdl); + + if (buftype == SA_SPILL) + arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); +} + +static int +sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys; + dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); + dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); + sa_os_t *sa = hdl->sa_os->os_sa; + sa_idx_tab_t *idx_tab; + + sa_hdr_phys = SA_GET_HDR(hdl, buftype); + + mutex_enter(&sa->sa_lock); + + /* Do we need to byteswap? 
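+ * If the stored sa_magic is non-zero but matches SA_MAGIC only after a + * BSWAP_32(), the buffer was written with the opposite byte order and + * sa_byteswap() above corrects it in place.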
*/ + + /* only check if not old znode */ + if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && + sa_hdr_phys->sa_magic != 0) { + VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); + sa_byteswap(hdl, buftype); + } + + idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); + + if (buftype == SA_BONUS) + hdl->sa_bonus_tab = idx_tab; + else + hdl->sa_spill_tab = idx_tab; + + mutex_exit(&sa->sa_lock); + return (0); +} + +/*ARGSUSED*/ +void +sa_evict(dmu_buf_t *db, void *sap) +{ + panic("evicting sa dbuf %p\n", (void *)db); +} + +static void +sa_idx_tab_rele(objset_t *os, void *arg) +{ + sa_os_t *sa = os->os_sa; + sa_idx_tab_t *idx_tab = arg; + + if (idx_tab == NULL) + return; + + mutex_enter(&sa->sa_lock); + if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { + list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); + if (idx_tab->sa_variable_lengths) + kmem_free(idx_tab->sa_variable_lengths, + sizeof (uint16_t) * + idx_tab->sa_layout->lot_var_sizes); + refcount_destroy(&idx_tab->sa_refcount); + kmem_free(idx_tab->sa_idx_tab, + sizeof (uint32_t) * sa->sa_num_attrs); + kmem_free(idx_tab, sizeof (sa_idx_tab_t)); + } + mutex_exit(&sa->sa_lock); +} + +static void +sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) +{ + sa_os_t *sa = os->os_sa; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + (void) refcount_add(&idx_tab->sa_refcount, NULL); +} + +void +sa_handle_destroy(sa_handle_t *hdl) +{ + mutex_enter(&hdl->sa_lock); + (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, + NULL, NULL, NULL); + + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (hdl->sa_spill_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + hdl->sa_spill_tab = NULL; + } + + dmu_buf_rele(hdl->sa_bonus, NULL); + + if (hdl->sa_spill) + dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + mutex_exit(&hdl->sa_lock); + + kmem_cache_free(sa_cache, hdl); +} + +int +sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + int error = 0; + dmu_object_info_t doi; + sa_handle_t *handle; + +#ifdef ZFS_DEBUG + dmu_object_info_from_db(db, &doi); + ASSERT(doi.doi_bonus_type == DMU_OT_SA || + doi.doi_bonus_type == DMU_OT_ZNODE); +#endif + /* find handle, if it exists */ + /* if one doesn't exist then create a new one, and initialize it */ + + handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (handle == NULL) { + sa_handle_t *newhandle; + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + handle->sa_userp = userp; + handle->sa_bonus = db; + handle->sa_os = os; + handle->sa_spill = NULL; + + error = sa_build_index(handle, SA_BONUS); + newhandle = (hdl_type == SA_HDL_SHARED) ? 
+ dmu_buf_set_user_ie(db, handle, + NULL, sa_evict) : NULL; + + if (newhandle != NULL) { + kmem_cache_free(sa_cache, handle); + handle = newhandle; + } + } + *handlepp = handle; + + return (error); +} + +int +sa_handle_get(objset_t *objset, uint64_t objid, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + dmu_buf_t *db; + int error; + + if (error = dmu_bonus_hold(objset, objid, NULL, &db)) + return (error); + + return (sa_handle_get_from_db(objset, db, userp, hdl_type, + handlepp)); +} + +int +sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +{ + return (dmu_bonus_hold(objset, obj_num, tag, db)); +} + +void +sa_buf_rele(dmu_buf_t *db, void *tag) +{ + dmu_buf_rele(db, tag); +} + +int +sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); +} + +int +sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = buf; + bulk.sa_length = buflen; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_lookup_impl(hdl, &bulk, 1); + mutex_exit(&hdl->sa_lock); + return (error); +} + +#ifdef _KERNEL +int +sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { + error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, + uio->uio_resid), UIO_READ, uio); + } + mutex_exit(&hdl->sa_lock); + return (error); + +} +#endif + +void * +sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) +{ + sa_idx_tab_t *idx_tab; + sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data; + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, search; + avl_index_t loc; + + /* + * Determine layout number. If SA node and header == 0 then + * force the index table to the dummy "1" empty layout. + * + * The layout number would only be zero for a newly created file + * that has not added any attributes yet, or with crypto enabled which + * doesn't write any attributes to the bonus buffer. + */ + + search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); + + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + + /* Verify header size is consistent with layout information */ + ASSERT(tb); + ASSERT(IS_SA_BONUSTYPE(bonustype) && + SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || + (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); + + /* + * See if any of the already existing TOC entries can be reused? 
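+ * A cached index table can only be shared when every variable-length + * attribute it records matches the lengths stored in this header.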
+ */ + + for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; + idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { + boolean_t valid_idx = B_TRUE; + int i; + + if (tb->lot_var_sizes != 0 && + idx_tab->sa_variable_lengths != NULL) { + for (i = 0; i != tb->lot_var_sizes; i++) { + if (hdr->sa_lengths[i] != + idx_tab->sa_variable_lengths[i]) { + valid_idx = B_FALSE; + break; + } + } + } + if (valid_idx) { + sa_idx_tab_hold(os, idx_tab); + return (idx_tab); + } + } + + /* No such luck, create a new entry */ + idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); + idx_tab->sa_idx_tab = + kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); + idx_tab->sa_layout = tb; + refcount_create(&idx_tab->sa_refcount); + if (tb->lot_var_sizes) + idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * + tb->lot_var_sizes, KM_SLEEP); + + sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, + tb, idx_tab); + sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ + sa_idx_tab_hold(os, idx_tab); /* one for layout */ + list_insert_tail(&tb->lot_idx_tab, idx_tab); + return (idx_tab); +} + +void +sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, + boolean_t start, void *userdata) +{ + ASSERT(start); + + *dataptr = userdata; + *len = total_len; +} + +static void +sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) +{ + uint64_t attr_value = 0; + sa_os_t *sa = hdl->sa_os->os_sa; + sa_attr_table_t *tb = sa->sa_attr_table; + int i; + + mutex_enter(&sa->sa_lock); + + if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { + mutex_exit(&sa->sa_lock); + return; + } + + if (sa->sa_reg_attr_obj == 0) { + sa->sa_reg_attr_obj = zap_create(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, + SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); + } + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_registered) + continue; + ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, + tb[i].sa_byteswap); + VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, + tb[i].sa_name, 8, 1, &attr_value, tx)); + tb[i].sa_registered = B_TRUE; + } + sa->sa_need_attr_registration = B_FALSE; + mutex_exit(&sa->sa_lock); +} + +/* + * Replace all attributes with attributes specified in template. + * If dnode had a spill buffer then those attributes will be + * also be replaced, possibly with just an empty spill block + * + * This interface is intended to only be used for bulk adding of + * attributes for a new file. It will also be used by the ZPL + * when converting and old formatted znode to native SA support. + */ +int +sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); +} + +int +sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_replace_all_by_template_locked(hdl, attr_desc, + attr_count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * add/remove/replace a single attribute and then rewrite the entire set + * of attributes. 
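+ * This works by taking a copy of the current bonus (and spill) contents, + * building a new bulk attribute description with the single change + * applied, and handing it to sa_build_layouts() to lay everything down + * again.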
+ */ +static int +sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + sa_bulk_attr_t *attr_desc; + void *old_data[2]; + int bonus_attr_count = 0; + int bonus_data_size, spill_data_size; + int spill_attr_count = 0; + int error; + uint16_t length; + int i, j, k, length_idx; + sa_hdr_phys_t *hdr; + sa_idx_tab_t *idx_tab; + int attr_count; + int count; + + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* First make of copy of the old data */ + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_bonuslen != 0) { + bonus_data_size = hdl->sa_bonus->db_size; + old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); + bcopy(hdl->sa_bonus->db_data, old_data[0], + hdl->sa_bonus->db_size); + bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; + } else { + old_data[0] = NULL; + } + DB_DNODE_EXIT(db); + + /* Bring spill buffer online if it isn't currently */ + + if ((error = sa_get_spill(hdl)) == 0) { + spill_data_size = hdl->sa_spill->db_size; + old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); + bcopy(hdl->sa_spill->db_data, old_data[1], + hdl->sa_spill->db_size); + spill_attr_count = + hdl->sa_spill_tab->sa_layout->lot_attr_count; + } else if (error && error != ENOENT) { + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + return (error); + } else { + old_data[1] = NULL; + } + + /* build descriptor of all attributes */ + + attr_count = bonus_attr_count + spill_attr_count; + if (action == SA_ADD) + attr_count++; + else if (action == SA_REMOVE) + attr_count--; + + attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); + + /* + * loop through bonus and spill buffer if it exists, and + * build up new attr_descriptor to reset the attributes + */ + k = j = 0; + count = bonus_attr_count; + hdr = SA_GET_HDR(hdl, SA_BONUS); + idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); + for (; k != 2; k++) { + /* iterate over each attribute in layout */ + for (i = 0, length_idx = 0; i != count; i++) { + sa_attr_type_t attr; + + attr = idx_tab->sa_layout->lot_attrs[i]; + if (attr == newattr) { + if (action == SA_REMOVE) { + j++; + continue; + } + ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); + ASSERT(action == SA_REPLACE); + SA_ADD_BULK_ATTR(attr_desc, j, attr, + locator, datastart, buflen); + } else { + length = SA_REGISTERED_LEN(sa, attr); + if (length == 0) { + length = hdr->sa_lengths[length_idx++]; + } + + SA_ADD_BULK_ATTR(attr_desc, j, attr, + NULL, (void *) + (TOC_OFF(idx_tab->sa_idx_tab[attr]) + + (uintptr_t)old_data[k]), length); + } + } + if (k == 0 && hdl->sa_spill) { + hdr = SA_GET_HDR(hdl, SA_SPILL); + idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); + count = spill_attr_count; + } else { + break; + } + } + if (action == SA_ADD) { + length = SA_REGISTERED_LEN(sa, newattr); + if (length == 0) { + length = buflen; + } + SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, + datastart, buflen); + } + + error = sa_build_layouts(hdl, attr_desc, attr_count, tx); + + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + if (old_data[1]) + kmem_free(old_data[1], spill_data_size); + kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); + + return (error); +} + +static int +sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + dmu_tx_t *tx) +{ + int error; + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_object_type_t bonustype; + + bonustype = 
SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); + + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* sync out registration table if necessary */ + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + + error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); + if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) + sa->sa_update_cb(hdl, tx); + + return (error); +} + +/* + * update or add new attribute + */ +int +sa_update(sa_handle_t *hdl, sa_attr_type_t type, + void *buf, uint32_t buflen, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = type; + bulk.sa_data_func = NULL; + bulk.sa_length = buflen; + bulk.sa_data = buf; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, + uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = userdata; + bulk.sa_data_func = locator; + bulk.sa_length = buflen; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Return size of an attribute + */ + +int +sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) +{ + sa_bulk_attr_t bulk; + int error; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { + mutex_exit(&hdl->sa_lock); + return (error); + } + *size = bulk.sa_size; + + mutex_exit(&hdl->sa_lock); + return (0); +} + +int +sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_lookup_impl(hdl, attrs, count)); +} + +int +sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_lookup_locked(hdl, attrs, count); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, attrs, count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, + NULL, 0, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +void +sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) +{ + dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); +} + +void +sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) +{ + dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, + blksize, nblocks); +} + +void +sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) +{ + (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, + oldhdl, newhdl, NULL, sa_evict); + oldhdl->sa_bonus = NULL; +} + +void +sa_set_userp(sa_handle_t *hdl, void *ptr) +{ + hdl->sa_userp = ptr; +} + +dmu_buf_t * +sa_get_db(sa_handle_t *hdl) +{ + return ((dmu_buf_t *)hdl->sa_bonus); +} + +void * +sa_get_userdata(sa_handle_t *hdl) +{ + return (hdl->sa_userp); +} + +void +sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) +{ + ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); + os->os_sa->sa_update_cb = func; +} + +void 
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func) +{ + + mutex_enter(&os->os_sa->sa_lock); + sa_register_update_callback_locked(os, func); + mutex_exit(&os->os_sa->sa_lock); +} + +uint64_t +sa_handle_object(sa_handle_t *hdl) +{ + return (hdl->sa_bonus->db_object); +} + +boolean_t +sa_enabled(objset_t *os) +{ + return (os->os_sa == NULL); +} + +int +sa_set_sa_object(objset_t *os, uint64_t sa_object) +{ + sa_os_t *sa = os->os_sa; + + if (sa->sa_master_obj) + return (1); + + sa->sa_master_obj = sa_object; + + return (0); +} + +int +sa_hdrsize(void *arg) +{ + sa_hdr_phys_t *hdr = arg; + + return (SA_HDR_SIZE(hdr)); +} + +void +sa_handle_lock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); +} + +void +sa_handle_unlock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_exit(&hdl->sa_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c index ca7076c..816c09a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -19,111 +19,36 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include -#include - -/* - * SHA-256 checksum, as specified in FIPS 180-3, available at: - * http://csrc.nist.gov/publications/PubsFIPS.html - * - * This is a very compact implementation of SHA-256. - * It is designed to be simple and portable, not to be fast. - */ - -/* - * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: - * - * Ch(x, y, z) (x & y) ^ (~x & z) - * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) - * - * We use equivalent logical reductions here that require one less op. 
- */ -#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) -#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) -#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) -#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) -#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) -#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) - -static const uint32_t SHA256_K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -static void -SHA256Transform(uint32_t *H, const uint8_t *cp) -{ - uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; - - for (t = 0; t < 16; t++, cp += 4) - W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; - - for (t = 16; t < 64; t++) - W[t] = sigma1(W[t - 2]) + W[t - 7] + - sigma0(W[t - 15]) + W[t - 16]; - - a = H[0]; b = H[1]; c = H[2]; d = H[3]; - e = H[4]; f = H[5]; g = H[6]; h = H[7]; - - for (t = 0; t < 64; t++) { - T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; - T2 = SIGMA0(a) + Maj(a, b, c); - h = g; g = f; f = e; e = d + T1; - d = c; c = b; b = a; a = T1 + T2; - } - - H[0] += a; H[1] += b; H[2] += c; H[3] += d; - H[4] += e; H[5] += f; H[6] += g; H[7] += h; -} +#ifdef _KERNEL +#include +#else +#include +#endif void zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) { - uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; - uint8_t pad[128]; - int i, padsize; - - for (i = 0; i < (size & ~63ULL); i += 64) - SHA256Transform(H, (uint8_t *)buf + i); - - for (padsize = 0; i < size; i++) - pad[padsize++] = *((uint8_t *)buf + i); - - for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) - pad[padsize] = 0; - - for (i = 56; i >= 0; i -= 8) - pad[padsize++] = (size << 3) >> i; - - for (i = 0; i < padsize; i += 64) - SHA256Transform(H, pad + i); - - ZIO_SET_CHECKSUM(zcp, - (uint64_t)H[0] << 32 | H[1], - (uint64_t)H[2] << 32 | H[3], - (uint64_t)H[4] << 32 | H[5], - (uint64_t)H[6] << 32 | H[7]); + SHA256_CTX ctx; + zio_cksum_t tmp; + + SHA256_Init(&ctx); + SHA256_Update(&ctx, buf, size); + SHA256_Final((unsigned char *)&tmp, &ctx); + + /* + * A prior implementation of this function had a + * private SHA256 implementation always wrote things out in + * Big Endian and there wasn't a byteswap variant of it. + * To preseve on disk compatibility we need to force that + * behaviour. 
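+ * In practice that means the native-endian words returned by + * SHA256_Final() are stored as four big-endian 64-bit words below, + * regardless of host byte order.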
+ */ + zcp->zc_word[0] = BE_64(tmp.zc_word[0]); + zcp->zc_word[1] = BE_64(tmp.zc_word[1]); + zcp->zc_word[2] = BE_64(tmp.zc_word[2]); + zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index c04102e..9336a6b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -35,13 +34,14 @@ #include #include #include -#include #include #include #include #include +#include #include #include +#include #include #include #include @@ -56,8 +56,16 @@ #include #include #include -#include #include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" @@ -70,17 +78,17 @@ TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, "Check hostid on import?"); -enum zti_modes { +typedef enum zti_modes { zti_mode_fixed, /* value is # of threads (min 1) */ zti_mode_online_percent, /* value is % of online CPUs */ - zti_mode_tune, /* fill from zio_taskq_tune_* */ + zti_mode_batch, /* cpu-intensive; value is ignored */ zti_mode_null, /* don't create a taskq */ zti_nmodes -}; +} zti_modes_t; #define ZTI_FIX(n) { zti_mode_fixed, (n) } #define ZTI_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_TUNE { zti_mode_tune, 0 } +#define ZTI_BATCH { zti_mode_batch, 0 } #define ZTI_NULL { zti_mode_null, 0 } #define ZTI_ONE ZTI_FIX(1) @@ -91,7 +99,7 @@ typedef struct zio_taskq_info { } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { - "issue", "issue_high", "intr", "intr_high" + "issue", "issue_high", "intr", "intr_high" }; /* @@ -101,18 +109,36 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL }, - { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, + { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, + { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; -enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; -uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */ - -static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); +static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport); +static void spa_vdev_resilver_done(spa_t *spa); + +uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ +#ifdef PSRSET_BIND +id_t zio_taskq_psrset_bind = PS_NONE; +#endif +#ifdef SYSDC +boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +#endif +uint_t zio_taskq_basedc = 80; /* base duty cycle */ + +boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ + +/* + * This (illegal) pool name is used when temporarily importing 
a spa_t in order + * to get the vdev stats associated with the imported devices. + */ +#define TRYIMPORT_NAME "$import" /* * ========================================================================== @@ -149,7 +175,7 @@ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { uint64_t size; - uint64_t used; + uint64_t alloc; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; @@ -157,17 +183,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (spa->spa_root_vdev != NULL) { - size = spa_get_space(spa); - used = spa_get_alloc(spa); + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, - size - used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + (spa_mode(spa) == FREAD), src); - cap = (size == 0) ? 0 : (used * 100 / size); + cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + ddt_get_pool_dedup_ratio(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, spa->spa_root_vdev->vdev_state, src); @@ -202,9 +233,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) int spa_prop_get(spa_t *spa, nvlist_t **nvp) { + objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; - objset_t *mos = spa->spa_meta_objset; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); @@ -217,7 +248,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ - if (spa->spa_pool_props_object == 0) { + if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } @@ -338,6 +369,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: + case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = EINVAL; @@ -375,12 +407,14 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } - if (error = dmu_objset_open(strval, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os)) + if (error = dmu_objset_hold(strval, FTAG, &os)) break; - /* We don't support gzip bootable datasets */ - if ((error = dsl_prop_get_integer(strval, + /* Must be ZPL and not gzip compressed. 
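+ * (The bootfs dataset has to stay readable by the boot code, which only + * understands ZPL data and cannot decompress gzip.)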
*/ + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + error = ENOTSUP; + } else if ((error = dsl_prop_get_integer(strval, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { @@ -388,7 +422,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) } else { objnum = dmu_objset_id(os); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } break; @@ -436,6 +470,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) strcmp(slash, "/..") == 0) error = EINVAL; break; + + case ZPOOL_PROP_DEDUPDITTO: + if (spa_version(spa) < SPA_VERSION_DEDUP) + error = ENOTSUP; + else + error = nvpair_value_uint64(elem, &intval); + if (error == 0 && + intval != 0 && intval < ZIO_DEDUPDITTO_MIN) + error = EINVAL; + break; } if (error) @@ -497,7 +541,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) nvpair_name(elem))) == ZPROP_INVAL) return (EINVAL); - if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + if (prop == ZPOOL_PROP_CACHEFILE || + prop == ZPOOL_PROP_ALTROOT || + prop == ZPOOL_PROP_READONLY) continue; need_sync = B_TRUE; @@ -569,20 +615,55 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) offsetof(spa_error_entry_t, se_avl)); } -/* - * Activate an uninitialized pool. - */ -static void -spa_activate(spa_t *spa, int mode) +static taskq_t * +spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, + uint_t value) { - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + uint_t flags = TASKQ_PREPOPULATE; + boolean_t batch = B_FALSE; - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_mode = mode; + switch (mode) { + case zti_mode_null: + return (NULL); /* no taskq needed */ - spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops); - spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops); + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case zti_mode_batch: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case zti_mode_online_percent: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s taskq (%u:%u) in " + "spa_activate()", + name, mode, value); + break; + } +#ifdef SYSDC + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + return (taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags)); + } +#endif + return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, + spa->spa_proc, flags)); +} + +static void +spa_create_zio_taskqs(spa_t *spa) +{ for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; @@ -593,43 +674,137 @@ spa_activate(spa_t *spa, int mode) (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); - if (mode == zti_mode_tune) { - mode = zio_taskq_tune_mode; - value = zio_taskq_tune_value; - if (mode == zti_mode_tune) - mode = zti_mode_online_percent; - } + spa->spa_zio_taskq[t][q] = + spa_taskq_create(spa, name, mode, value); + } + } +} - switch (mode) { - case zti_mode_fixed: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); +#ifdef _KERNEL +#ifdef SPA_PROCESS +static void +spa_thread(void *arg) +{ + callb_cpr_t cprinfo; - spa->spa_zio_taskq[t][q] = taskq_create(name, - value, maxclsyspri, 50, INT_MAX, - TASKQ_PREPOPULATE); - break; + spa_t *spa = arg; + user_t *pu = PTOU(curproc); + + CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, + spa->spa_name); + + ASSERT(curproc != &p0); + (void) 
snprintf(pu->u_psargs, sizeof (pu->u_psargs), + "zpool-%s", spa->spa_name); + (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); + +#ifdef PSRSET_BIND + /* bind this thread to the requested psrset */ + if (zio_taskq_psrset_bind != PS_NONE) { + pool_lock(); + mutex_enter(&cpu_lock); + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, + 0, NULL, NULL) == 0) { + curthread->t_bind_pset = zio_taskq_psrset_bind; + } else { + cmn_err(CE_WARN, + "Couldn't bind process for zfs pool \"%s\" to " + "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); + } - case zti_mode_online_percent: - spa->spa_zio_taskq[t][q] = taskq_create(name, - value, maxclsyspri, 50, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); - break; + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + mutex_exit(&cpu_lock); + pool_unlock(); + } +#endif - case zti_mode_null: - spa->spa_zio_taskq[t][q] = NULL; - break; +#ifdef SYSDC + if (zio_taskq_sysdc) { + sysdc_thread_enter(curthread, 100, 0); + } +#endif - case zti_mode_tune: - default: - panic("unrecognized mode for " - "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " - "in spa_activate()", - t, q, mode, value); - break; + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; + + spa_create_zio_taskqs(spa); + + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); + + spa->spa_proc_state = SPA_PROC_ACTIVE; + cv_broadcast(&spa->spa_proc_cv); + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_proc_state == SPA_PROC_ACTIVE) + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); + + ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); + spa->spa_proc_state = SPA_PROC_GONE; + spa->spa_proc = &p0; + cv_broadcast(&spa->spa_proc_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ + + mutex_enter(&curproc->p_lock); + lwp_exit(); +} +#endif /* SPA_PROCESS */ +#endif + +/* + * Activate an uninitialized pool. + */ +static void +spa_activate(spa_t *spa, int mode) +{ + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_mode = mode; + + spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + + /* Try to create a covering process */ + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_NONE); + ASSERT(spa->spa_proc == &p0); + spa->spa_did = 0; + +#ifdef SPA_PROCESS + /* Only create a process if we're going to be around a while. */ + if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { + if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, + NULL, 0) == 0) { + spa->spa_proc_state = SPA_PROC_CREATED; + while (spa->spa_proc_state == SPA_PROC_CREATED) { + cv_wait(&spa->spa_proc_cv, + &spa->spa_proc_lock); } + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + ASSERT(spa->spa_proc != &p0); + ASSERT(spa->spa_did != 0); + } else { +#ifdef _KERNEL + cmn_err(CE_WARN, + "Couldn't create process for zfs pool \"%s\"\n", + spa->spa_name); +#endif } } +#endif /* SPA_PROCESS */ + mutex_exit(&spa->spa_proc_lock); + + /* If we didn't create a process, we need to create our taskqs. 
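+ * (When no separate pool process was created, spa_proc is still &p0, + * so the taskqs are created directly here.)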
*/ + ASSERT(spa->spa_proc == &p0); + if (spa->spa_proc == &p0) { + spa_create_zio_taskqs(spa); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -688,6 +863,33 @@ spa_deactivate(spa_t *spa) avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; + + mutex_enter(&spa->spa_proc_lock); + if (spa->spa_proc_state != SPA_PROC_NONE) { + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + spa->spa_proc_state = SPA_PROC_DEACTIVATE; + cv_broadcast(&spa->spa_proc_cv); + while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { + ASSERT(spa->spa_proc != &p0); + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_GONE); + spa->spa_proc_state = SPA_PROC_NONE; + } + ASSERT(spa->spa_proc == &p0); + mutex_exit(&spa->spa_proc_lock); + +#ifdef SPA_PROCESS + /* + * We want to make sure spa_thread() has actually exited the ZFS + * module, so that the module can't be unloaded out from underneath + * it. + */ + if (spa->spa_did != 0) { + thread_join(spa->spa_did); + spa->spa_did = 0; + } +#endif /* SPA_PROCESS */ } /* @@ -701,7 +903,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; - uint_t c, children; + uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) @@ -722,7 +924,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (EINVAL); } - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { @@ -768,14 +970,19 @@ spa_unload(spa_t *spa) spa->spa_async_zio_root = NULL; } + bpobj_close(&spa->spa_deferred_bpobj); + /* * Close the dsl pool. 
*/ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; + spa->spa_meta_objset = NULL; } + ddt_unload(spa); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -928,7 +1135,7 @@ spa_load_spares(spa_t *spa) KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, - spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) @@ -950,7 +1157,7 @@ spa_load_l2cache(spa_t *spa) nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; - uint64_t guid, size; + uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; @@ -1014,12 +1221,8 @@ spa_load_l2cache(spa_t *spa) (void) vdev_validate_aux(vd); - if (!vdev_is_dead(vd)) { - size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + if (!vdev_is_dead(vd)) + l2arc_add_vdev(spa, vd); } } @@ -1058,7 +1261,7 @@ spa_load_l2cache(spa_t *spa) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: @@ -1098,9 +1301,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { @@ -1110,36 +1311,131 @@ spa_check_removed(vdev_t *vd) } /* - * Load the slog device state from the config object since it's possible - * that the label does not contain the most up-to-date information. + * Validate the current config against the MOS config */ -void -spa_load_log_state(spa_t *spa) +static boolean_t +spa_config_valid(spa_t *spa, nvlist_t *config) { - nvlist_t *nv, *nvroot, **child; - uint64_t is_log; - uint_t children, c; - vdev_t *rvd = spa->spa_root_vdev; + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv; - VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0); - VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); - for (c = 0; c < children; c++) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); + + /* + * If we're doing a normal import, then build up any additional + * diagnostic information about missing devices in this config. + * We'll pass this up to the user for further processing. 
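+ * The nvlists built here are attached to spa_load_info under + * ZPOOL_CONFIG_MISSING_DEVICES so the caller can report which + * top-level devices are missing.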
+ */ + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { + nvlist_t **child, *nv; + uint64_t idx = 0; + + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), + KM_SLEEP); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops && + mtvd->vdev_islog) + child[idx++] = vdev_config_generate(spa, mtvd, + B_FALSE, 0); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + + for (int i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + } + + /* + * Compare the root vdev tree with the information we have + * from the MOS config (mrvd). Check each top-level vdev + * with the corresponding MOS config top-level (mtvd). + */ + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + /* + * Resolve any "missing" vdevs in the current configuration. + * If we find that the MOS config has more accurate information + * about the top-level vdev then use that vdev instead. + */ + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) { + + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) + continue; + + /* + * Device specific actions. + */ + if (mtvd->vdev_islog) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * XXX - once we have 'readonly' pool + * support we should be able to handle + * missing data devices by transitioning + * the pool to readonly. + */ + continue; + } + + /* + * Swap the missing vdev with the data we were + * able to obtain from the MOS config. + */ + vdev_remove_child(rvd, tvd); + vdev_remove_child(mrvd, mtvd); + + vdev_add_child(rvd, mtvd); + vdev_add_child(mrvd, tvd); + + spa_config_exit(spa, SCL_ALL, FTAG); + vdev_load(mtvd); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log) == 0 && is_log) - vdev_load_log_state(tvd, child[c]); + vdev_reopen(rvd); + } else if (mtvd->vdev_islog) { + /* + * Load the slog device's state from the MOS config + * since it's possible that the label does not + * contain the most up-to-date information. + */ + vdev_load_log_state(tvd, mtvd); + vdev_reopen(tvd); + } } - nvlist_free(nv); + vdev_free(mrvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Ensure we were able to validate the config. + */ + return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ -int +static int spa_check_logs(spa_t *spa) { switch (spa->spa_log_state) { @@ -1148,7 +1444,7 @@ spa_check_logs(spa_t *spa) case SPA_LOG_UNKNOWN: if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN)) { - spa->spa_log_state = SPA_LOG_MISSING; + spa_set_log_state(spa, SPA_LOG_MISSING); return (1); } break; @@ -1156,109 +1452,439 @@ spa_check_logs(spa_t *spa) return (0); } -/* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. 
- */ -static int -spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) +static boolean_t +spa_passivate_log(spa_t *spa) { - int error = 0; - nvlist_t *nvroot = NULL; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; - uint64_t pool_guid; - uint64_t version; - uint64_t autoreplace = 0; - int orig_mode = spa->spa_mode; - char *ereport = FM_EREPORT_ZFS_POOL; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; - /* - * If this is an untrusted config, access the pool in read-only mode. - * This prevents things like resilvering recently removed devices. - */ - if (!mosconfig) - spa->spa_mode = FREAD; + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (!spa_has_slogs(spa)) + return (B_FALSE); - spa->spa_load_state = state; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - error = EINVAL; - goto out; + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } } - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = SPA_VERSION_INITIAL; + return (slog_found); +} - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); +static void +spa_activate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = EEXIST; - goto out; + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) + metaslab_group_activate(mg); } +} - spa->spa_load_guid = pool_guid; +int +spa_offline_log(spa_t *spa) +{ + int error = 0; - /* - * Create "The Godfather" zio to hold all async IOs - */ - spa->spa_async_zio_root = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN)) == 0) { - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_ubsync.ub_version = version; - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, SCL_ALL, FTAG); + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} - if (error != 0) - goto out; +static void +spa_aux_check_removed(spa_aux_vdev_t *sav) +{ + int i; - ASSERT(spa->spa_root_vdev == rvd); - ASSERT(spa_guid(spa) == pool_guid); + for (i = 0; i < sav->sav_count; i++) + spa_check_removed(sav->sav_vdevs[i]); +} - /* - * Try to open all vdevs, loading each label in the process. 
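/*
 * Illustrative sketch (not part of the patch): the passivate/activate
 * pattern used for log devices above.  Walking the top-level children,
 * every slog has its metaslab group taken out of (or put back into) the
 * allocation rotation, and the return value tells the caller whether any
 * slog exists at all.  The struct is a stand-in for vdev_t and its
 * metaslab group.
 */
#include <stdbool.h>

struct toplevel_vdev {
	bool is_log;
	bool mg_active;
};

static bool
passivate_logs(struct toplevel_vdev *tvds, int count)
{
	bool slog_found = false;

	for (int c = 0; c < count; c++) {
		if (tvds[c].is_log) {
			tvds[c].mg_active = false;	/* metaslab_group_passivate() */
			slog_found = true;
		}
	}
	return (slog_found);
}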
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; +void +spa_claim_notify(zio_t *zio) +{ + spa_t *spa = zio->io_spa; - /* - * We need to validate the vdev labels against the configuration that - * we have in hand, which is dependent on the setting of mosconfig. If - * mosconfig is true then we're validating the vdev labels based on - * that config. Otherwise, we're validating against the cached config - * (zpool.cache) that was read when we loaded the zfs module, and then - * later we will recursively call spa_load() and validate against - * the vdev config. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; + if (zio->io_error) + return; - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } + mutex_enter(&spa->spa_props_lock); /* any mutex will do */ + if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) + spa->spa_claim_max_txg = zio->io_bp->blk_birth; + mutex_exit(&spa->spa_props_lock); +} + +typedef struct spa_load_error { + uint64_t sle_meta_count; + uint64_t sle_data_count; +} spa_load_error_t; + +static void +spa_load_verify_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; + + if (error) { + if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + type != DMU_OT_INTENT_LOG) + atomic_add_64(&sle->sle_meta_count, 1); + else + atomic_add_64(&sle->sle_data_count, 1); + } + zio_data_buf_free(zio->io_data, zio->io_size); +} + +/*ARGSUSED*/ +static int +spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + if (bp != NULL) { + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + void *data = zio_data_buf_alloc(size); + + zio_nowait(zio_read(rio, spa, bp, data, size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + } + return (0); +} + +static int +spa_load_verify(spa_t *spa) +{ + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_rewind_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error; + + zpool_get_rewind_policy(spa->spa_config, &policy); + + if (policy.zrp_request & ZPOOL_NEVER_REWIND) + return (0); + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); + + (void) zio_wait(rio); + + spa->spa_load_meta_errors = sle.sle_meta_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && + sle.sle_data_count <= policy.zrp_maxdata) { + int64_t loss = 0; + + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); + } else { + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + } + + if (error) { + if (error != ENXIO && error 
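/*
 * Illustrative sketch (not part of the patch): all spa_claim_notify() has
 * to do is remember the largest blk_birth seen among claimed log blocks,
 * so any mutex that serializes the compare-and-update will do -- which is
 * why the real code simply borrows spa_props_lock.  A userland analogue:
 */
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t claim_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t claim_max_txg;

static void
claim_notify(uint64_t blk_birth)
{
	pthread_mutex_lock(&claim_lock);
	if (claim_max_txg < blk_birth)
		claim_max_txg = blk_birth;
	pthread_mutex_unlock(&claim_lock);
}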
!= EIO) + error = EIO; + return (error); + } + + return (verify_ok ? 0 : EIO); +} + +/* + * Find a value in the pool props object. + */ +static void +spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) +{ + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); +} + +/* + * Find a value in the pool directory object. + */ +static int +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) +{ + return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val)); +} + +static int +spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) +{ + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (err); +} + +/* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, with all the vdevs in place in + * the original pool. + */ +static void +spa_try_repair(spa_t *spa, nvlist_t *config) +{ + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. + */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; + } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. 
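/*
 * Illustrative sketch (not part of the patch): the acceptance test at the
 * end of spa_load_verify().  The traversal keeps separate metadata and
 * data error counters, the rewind policy supplies the per-class limits,
 * and the "loss" reported back to userland is simply the difference
 * between the timestamp of the txg being abandoned and the one being
 * loaded.  Names are simplified stand-ins for the spa_load_error_t fields.
 */
#include <stdbool.h>
#include <stdint.h>

struct rewind_limits {
	uint64_t maxmeta;
	uint64_t maxdata;
};

static bool
verify_ok(int traverse_error, uint64_t meta_errors, uint64_t data_errors,
    const struct rewind_limits *lim, uint64_t last_ubsync_ts,
    uint64_t load_ts, int64_t *loss)
{
	if (traverse_error != 0 || meta_errors > lim->maxmeta ||
	    data_errors > lim->maxdata)
		return (false);

	*loss = (int64_t)(last_ubsync_ts - load_ts);
	return (true);
}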
+ */ + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); + } + + kmem_free(vd, gcount * sizeof (vdev_t *)); +} + +static int +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, + boolean_t mosconfig) +{ + nvlist_t *config = spa->spa_config; + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + uint64_t pool_guid; + nvlist_t *nvl; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) + return (EINVAL); + + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + } else { + spa->spa_load_guid = pool_guid; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, + &nvl) == 0) { + VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, + KM_SLEEP) == 0); + } + + gethrestime(&spa->spa_loaded_ts); + error = spa_load_impl(spa, pool_guid, config, state, type, + mosconfig, &ereport); + } + + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error) { + if (error != EEXIST) { + spa->spa_loaded_ts.tv_sec = 0; + spa->spa_loaded_ts.tv_nsec = 0; + } + if (error != EBADF) { + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + } + } + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + +/* + * Load an existing storage pool, using the pool's builtin spa_config as a + * source of configuration information. + */ +static int +spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport) +{ + int error = 0; + nvlist_t *nvroot = NULL; + vdev_t *rvd; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t children, config_cache_txg = spa->spa_config_txg; + int orig_mode = spa->spa_mode; + int parse; + uint64_t obj; + + /* + * If this is an untrusted config, access the pool in read-only mode. + * This prevents things like resilvering recently removed devices. + */ + if (!mosconfig) + spa->spa_mode = FREAD; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa->spa_load_state = state; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) + return (EINVAL); + + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + ASSERT(spa->spa_root_vdev == rvd); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + /* + * Try to open all vdevs, loading each label in the process. 
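/*
 * Illustrative sketch (not part of the patch): the decision spa_try_repair()
 * reaches after the reopen attempt.  Either every guid on the split list
 * reports VDEV_AUX_SPLIT_POOL in its label (and the split is completed), or
 * the config is left alone with all vdevs back in the original pool.  The
 * array of aux codes stands in for vd[i]->vdev_stat.vs_aux; hole entries
 * are ignored here for brevity.
 */
#include <stdbool.h>

static bool
split_completed(const int *vdev_aux, int count, int aux_split_pool)
{
	int extracted = 0;

	for (int i = 0; i < count; i++) {
		if (vdev_aux[i] != aux_split_pool)
			break;
		extracted++;
	}
	return (extracted == count);
}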
+ */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + return (error); + + /* + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of mosconfig. If + * mosconfig is true then we're validating the vdev labels based on + * that config. Otherwise, we're validating against the cached config + * (zpool.cache) that was read when we loaded the zfs module, and then + * later we will recursively call spa_load() and validate against + * the vdev config. + * + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + } /* * Find the best uberblock. @@ -1268,32 +1894,33 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * If we weren't able to find a single valid uberblock, return failure. */ - if (ub->ub_txg == 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = ENXIO; - goto out; - } + if (ub->ub_txg == 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); /* * If the pool is newer than the code, we can't open it. */ - if (ub->ub_version > SPA_VERSION) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_VERSION_NEWER); - error = ENOTSUP; - goto out; - } + if (ub->ub_version > SPA_VERSION) + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); /* * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. + * incomplete configuration. We first check to see if the pool + * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). + * If it is, defer the vdev_guid_sum check till later so we + * can handle missing vdevs. */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_GUID_SUM); - error = ENXIO; - goto out; + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && + rvd->vdev_guid_sum != ub->ub_guid_sum) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; } /* @@ -1301,221 +1928,174 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; - spa->spa_first_txg = spa_last_synced_txg(spa) + 1; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
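/*
 * Illustrative sketch (not part of the patch): the quantity compared
 * against ub_guid_sum above.  Every vdev in the tree contributes its guid,
 * and the total must match what the uberblock recorded; a mismatch means
 * some device that belonged to the pool is absent from this config.  The
 * struct is a simplified stand-in for vdev_t.
 */
#include <stdint.h>

struct gvdev {
	uint64_t guid;
	struct gvdev **children;
	int nchildren;
};

static uint64_t
guid_sum(const struct gvdev *vd)
{
	uint64_t sum = vd->guid;

	for (int c = 0; c < vd->nchildren; c++)
		sum += guid_sum(vd->children[c]);
	return (sum);
}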
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - goto out; - } + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { - nvlist_t *newconfig; uint64_t hostid; + nvlist_t *policy = NULL, *nvconfig; - if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; - VERIFY(nvlist_lookup_string(newconfig, + VERIFY(nvlist_lookup_string(nvconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); +#ifdef _KERNEL + myhostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so + * we can't use zone_get_hostid(). + */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); +#endif /* _KERNEL */ if (check_hostid && hostid != 0 && myhostid != 0 && - (unsigned long)hostid != myhostid) { + hostid != myhostid) { + nvlist_free(nvconfig); cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " "See: http://www.sun.com/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); - error = EBADF; - goto out; + return (EBADF); } } + if (nvlist_lookup_nvlist(spa->spa_config, + ZPOOL_REWIND_POLICY, &policy) == 0) + VERIFY(nvlist_add_nvlist(nvconfig, + ZPOOL_REWIND_POLICY, policy) == 0); - spa_config_set(spa, newconfig); + spa_config_set(spa, nvconfig); spa_unload(spa); spa_deactivate(spa); spa_activate(spa, orig_mode); - return (spa_load(spa, newconfig, state, B_TRUE)); + return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. 
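/*
 * Illustrative sketch (not part of the patch): the shape of the hostid test
 * above.  A userland analogue would use gethostid(); the kernel path uses
 * zone_get_hostid() (or the hw_serial fallback).  A zero on either side
 * disables the comparison, and check_hostid can turn it off entirely.
 */
#include <unistd.h>

static int
hostid_matches(unsigned long pool_hostid, int check_hostid)
{
	unsigned long myhostid = (unsigned long)gethostid();

	if (!check_hostid || pool_hostid == 0 || myhostid == 0)
		return (1);
	return (pool_hostid == myhostid);
}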
*/ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, + &spa->spa_creation_version); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, - sizeof (uint64_t), 1, &spa->spa_errlog_last); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, - sizeof (uint64_t), 1, &spa->spa_errlog_scrub); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, + &spa->spa_errlog_scrub); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, - sizeof (uint64_t), 1, &spa->spa_history); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling the pool from the split-off vdevs of + * an existing pool, we don't want to attach the spares & cache + * devices. + */ /* * Load any hot spares for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_spares.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. 
*/ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, - VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_l2cache.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_l2cache.sav_sync = B_TRUE; } - spa_load_log_state(spa); - - if (spa_check_logs(spa)) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LOG); - error = ENXIO; - ereport = FM_EREPORT_ZFS_LOG_REPLAY; - goto out; - } - - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); - - if (error && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), - sizeof (uint64_t), 1, &spa->spa_bootfs); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), - sizeof (uint64_t), 1, &autoreplace); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_DELEGATION), - sizeof (uint64_t), 1, &spa->spa_delegation); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), - sizeof (uint64_t), 1, &spa->spa_failmode); + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + + spa->spa_autoreplace = (autoreplace != 0); } /* @@ -1525,8 +2105,18 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * unopenable vdevs so that the normal autoreplace handler can take * over. */ - if (autoreplace && state != SPA_LOAD_TRYIMPORT) + if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); + /* + * For the import case, this is done in spa_import(), because + * at this point we're using the spare definitions from + * the MOS config, not necessarily from the userland config. 
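/*
 * Illustrative sketch (not part of the patch): the error discipline shared
 * by the spa_dir_prop()/spa_prop_find() lookups above.  ENOENT only means
 * the pool predates the feature, so the in-core default is kept; any other
 * failure marks the MOS as corrupt.  The helper below is hypothetical and
 * only models that pattern.
 */
#include <errno.h>
#include <stdint.h>

static int
load_optional_prop(int lookup_err, uint64_t looked_up, uint64_t *dest)
{
	if (lookup_err != 0 && lookup_err != ENOENT)
		return (EIO);		/* maps to spa_vdev_err(..., EIO) */
	if (lookup_err == 0)
		*dest = looked_up;	/* on ENOENT the default stays */
	return (0);
}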
+ */ + if (state != SPA_LOAD_IMPORT) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + } /* * Load the vdev state for all toplevel vdevs. @@ -1541,15 +2131,60 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa_config_exit(spa, SCL_ALL, FTAG); /* - * Check the state of the root vdev. If it can't be opened, it - * indicates one or more toplevel vdevs are faulted. + * Load the DDTs (dedup tables). */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; + error = ddt_load(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + spa_update_dspace(spa); + + /* + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + nvlist_t *nvconfig; + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (!spa_config_valid(spa, nvconfig)) { + nvlist_free(nvconfig); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + nvlist_free(nvconfig); + + /* + * Now that we've validate the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + + if (spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + } } - if (spa_writeable(spa)) { + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (state != SPA_LOAD_TRYIMPORT) { + if (error = spa_load_verify(spa)) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; @@ -1558,31 +2193,44 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. + * Note: spa_claim_max_txg is updated by spa_claim_notify(), + * invoked from zil_claim_log_block()'s i/o done callback. + * Price of rollback is that we abandon the log. */ + spa->spa_claiming = B_TRUE; + tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); - spa->spa_log_state = SPA_LOG_GOOD; + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* - * Wait for all claims to sync. + * Wait for all claims to sync. We sync up to the highest + * claimed log block birth time so that claimed log blocks + * don't appear to be from the future. spa_claim_max_txg + * will have been set for us by either zil_check_log_chain() + * (invoked from spa_check_logs()) or zil_claim() above. */ - txg_wait_synced(spa->spa_dsl_pool, 0); + txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * - * If spa_load_verbatim is true, trust the current + * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. 
*/ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) + state == SPA_LOAD_IMPORT || + state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -1599,19 +2247,100 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Check all DTLs to see if anything needs resilvering. */ - if (vdev_resilver_needed(rvd, NULL, NULL)) + if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); + + /* + * Delete any inconsistent datasets. + */ + (void) dmu_objset_find(spa_name(spa), + dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); + + /* + * Clean up any stale temporary dataset userrefs. + */ + dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } - error = 0; -out: - spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - spa->spa_load_state = SPA_LOAD_NONE; - spa->spa_ena = 0; + return (0); +} - return (error); +static int +spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) +{ + int mode = spa->spa_mode; + + spa_unload(spa); + spa_deactivate(spa); + + spa->spa_load_max_txg--; + + spa_activate(spa, mode); + spa_async_suspend(spa); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); +} + +static int +spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, + uint64_t max_request, int rewind_flags) +{ + nvlist_t *config = NULL; + int load_error, rewind_error; + uint64_t safe_rewind_txg; + uint64_t min_txg; + + if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { + spa->spa_load_max_txg = spa->spa_load_txg; + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + spa->spa_load_max_txg = max_request; + } + + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, + mosconfig); + if (load_error == 0) + return (0); + + if (spa->spa_root_vdev != NULL) + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; + spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; + + if (rewind_flags & ZPOOL_NEVER_REWIND) { + nvlist_free(config); + return (load_error); + } + + /* Price of rolling back is discarding txgs, including log */ + if (state == SPA_LOAD_RECOVER) + spa_set_log_state(spa, SPA_LOG_CLEAR); + + spa->spa_load_max_txg = spa->spa_last_ubsync_txg; + safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; + min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? + TXG_INITIAL : safe_rewind_txg; + + /* + * Continue as long as we're finding errors, we're still within + * the acceptable rewind range, and we're still finding uberblocks + */ + while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && + spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { + if (spa->spa_load_max_txg < safe_rewind_txg) + spa->spa_extreme_rewind = B_TRUE; + rewind_error = spa_load_retry(spa, state, mosconfig); + } + + spa->spa_extreme_rewind = B_FALSE; + spa->spa_load_max_txg = UINT64_MAX; + + if (config && (rewind_error || state != SPA_LOAD_RECOVER)) + spa_config_set(spa, config); + + return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); } /* @@ -1627,11 +2356,14 @@ out: * ambiguous state. 
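/*
 * Illustrative sketch (not part of the patch): the rewind window walked by
 * spa_load_best().  A small number of txgs (TXG_DEFER_SIZE) below the last
 * synced uberblock are considered safe to roll back to; going further
 * requires the extreme-rewind request, and each retry lowers the ceiling
 * by one txg until a load succeeds or the best uberblock found falls
 * outside the window.  The constants and the try_load callback below are
 * stand-ins, not the real interfaces.
 */
#include <stdbool.h>
#include <stdint.h>

#define	SKETCH_TXG_DEFER	2	/* stand-in for TXG_DEFER_SIZE */
#define	SKETCH_TXG_FIRST	1	/* stand-in for TXG_INITIAL */

static int
rewind_to_good_txg(uint64_t last_ubsync_txg, bool extreme,
    int (*try_load)(uint64_t max_txg, uint64_t *found_txg))
{
	uint64_t max_txg = last_ubsync_txg;
	uint64_t safe_txg = last_ubsync_txg - SKETCH_TXG_DEFER;
	uint64_t min_txg = extreme ? SKETCH_TXG_FIRST : safe_txg;
	uint64_t found_txg = last_ubsync_txg;
	int error;

	error = try_load(max_txg, &found_txg);
	while (error != 0 && found_txg >= min_txg && found_txg <= max_txg) {
		max_txg--;			/* what spa_load_retry() does */
		error = try_load(max_txg, &found_txg);
	}
	return (error);
}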
*/ static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, + nvlist_t **config) { spa_t *spa; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; + int firstopen = B_FALSE; *spapp = NULL; @@ -1651,11 +2383,24 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) mutex_exit(&spa_namespace_lock); return (ENOENT); } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + zpool_rewind_policy_t policy; + + firstopen = B_TRUE; + + zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, + &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); - error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + + error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, + policy.zrp_request); if (error == EBADF) { /* @@ -1680,38 +2425,66 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_root_vdev != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, - B_TRUE); + if (config != NULL && spa->spa_config) { + VERIFY(nvlist_dup(spa->spa_config, config, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist(*config, + ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } spa_unload(spa); spa_deactivate(spa); - spa->spa_last_open_failed = B_TRUE; + spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); - } else { - spa->spa_last_open_failed = B_FALSE; } } spa_open_ref(spa, tag); - if (locked) + if (config != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + /* + * If we've recovered the pool, pass back any information we + * gathered while doing the load. 
+ */ + if (state == SPA_LOAD_RECOVER) { + VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + + if (locked) { + spa->spa_last_open_failed = 0; + spa->spa_last_ubsync_txg = 0; + spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (firstopen) + zvol_create_minors(pool); +#endif +#endif + } *spapp = spa; - if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - return (0); } int +spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, + nvlist_t **config) +{ + return (spa_open_common(name, spapp, tag, policy, config)); +} + +int spa_open(const char *name, spa_t **spapp, void *tag) { - return (spa_open_common(name, spapp, tag, NULL)); + return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* @@ -1782,7 +2555,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config) if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_STATS, + spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; @@ -1839,7 +2612,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); vdev_get_stats(vd, vs); } } @@ -1852,7 +2626,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, config); + error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* @@ -1863,6 +2637,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { + uint64_t loadtimes[2]; + + loadtimes[0] = spa->spa_loaded_ts.tv_sec; + loadtimes[1] = spa->spa_loaded_ts.tv_nsec; + VERIFY(nvlist_add_uint64_array(*config, + ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); @@ -2092,11 +2873,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; - int c, error = 0; + int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version; + uint64_t version, obj; /* * If this pool already exists, return failure. 
@@ -2112,11 +2893,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); + spa = spa_add(pool, NULL, altroot); spa_activate(spa, spa_mode_global); - spa->spa_uberblock.ub_txg = txg - 1; - if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); @@ -2128,6 +2907,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, &version) != 0) version = SPA_VERSION; ASSERT(version <= SPA_VERSION); + + spa->spa_first_txg = txg; + spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; @@ -2154,9 +2936,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_init(rvd->vdev_child[c], txg); - vdev_config_dirty(rvd); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_metaslab_set_size(rvd->vdev_child[c]); + vdev_expand(rvd->vdev_child[c], txg); + } } spa_config_exit(spa, SCL_ALL, FTAG); @@ -2202,6 +2985,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + /* + * Create DDTs (dedup tables). + */ + ddt_create(spa); + + spa_update_dspace(spa); + tx = dmu_tx_create_assigned(dp, txg); /* @@ -2217,6 +3007,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, + sizeof (uint64_t), 1, &version, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool version"); + } + /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; @@ -2228,20 +3024,20 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, } /* - * Create the deferred-free bplist object. Turn off compression + * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ - spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, - 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, + obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bplist"); + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, + sizeof (uint64_t), 1, &obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bpobj"); } + VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, + spa->spa_meta_objset, obj)); /* * Create the pool's history object. 
@@ -2255,9 +3051,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); + if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(spa, props, CRED(), tx); + spa_sync_props(spa, props, tx); } dmu_tx_commit(tx); @@ -2275,6 +3073,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); + spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); @@ -2283,32 +3082,39 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (0); } -#ifdef sun +#if defined(sun) #ifdef _KERNEL /* - * Build a "root" vdev for a top level vdev read in from a rootpool - * device label. + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. */ -static void -spa_build_rootpool_config(nvlist_t *config) +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +static nvlist_t * +spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { + nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; + if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) + return (NULL); + /* * Add this top-level vdev to the child array. */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) - == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) - == 0); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) - == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, @@ -2320,127 +3126,40 @@ spa_build_rootpool_config(nvlist_t *config) */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + return (config); } /* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. 
- */ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - -int -spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, - uint64_t *besttxg) -{ - nvlist_t *config; - uint64_t txg; - int error; - - if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) - return (error); - - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - if (bestconf != NULL) - *bestconf = config; - else - nvlist_free(config); - *besttxg = txg; - return (0); -} - -boolean_t -spa_rootdev_validate(nvlist_t *nv) -{ - uint64_t ival; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) - return (B_FALSE); - - return (B_TRUE); -} - - -/* - * Given the boot device's physical path or devid, check if the device - * is in a valid state. If so, return the configuration from the vdev - * label. + * Walk the vdev tree and see if we can find a device with "better" + * configuration. A configuration is "better" if the label on that + * device has a more recent txg. */ -int -spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) +static void +spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { - nvlist_t *conf = NULL; - uint64_t txg = 0; - nvlist_t *nvtop, **child; - char *type; - char *bootpath = NULL; - uint_t children, c; - char *tmp; - int error; - - if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) - *tmp = '\0'; - if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { - cmn_err(CE_NOTE, "error reading device label"); - return (error); - } - if (txg == 0) { - cmn_err(CE_NOTE, "this device is detached"); - nvlist_free(conf); - return (EINVAL); - } + for (int c = 0; c < vd->vdev_children; c++) + spa_alt_rootvdev(vd->vdev_child[c], avd, txg); - VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); + if (vd->vdev_ops->vdev_op_leaf) { + nvlist_t *label; + uint64_t label_txg; - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (spa_rootdev_validate(nvtop)) { - goto out; - } else { - nvlist_free(conf); - return (EINVAL); - } - } - - ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); + if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, + &label) != 0) + return; - VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); + VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &label_txg) == 0); - /* - * Go thru vdevs in the mirror to see if the given device - * has the most recent txg. Only the device with the most - * recent txg has valid information and should be booted. - */ - for (c = 0; c < children; c++) { - char *cdevid, *cpath; - uint64_t tmptxg; - - cpath = NULL; - cdevid = NULL; - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, - &cpath) != 0 && nvlist_lookup_string(child[c], - ZPOOL_CONFIG_DEVID, &cdevid) != 0) - return (EINVAL); - if ((spa_check_rootconf(cpath, cdevid, NULL, - &tmptxg) == 0) && (tmptxg > txg)) { - txg = tmptxg; - VERIFY(nvlist_lookup_string(child[c], - ZPOOL_CONFIG_PATH, &bootpath) == 0); + /* + * Do we have a better boot device? + */ + if (label_txg > *txg) { + *txg = label_txg; + *avd = vd; } + nvlist_free(label); } - - /* Does the best device match the one we've booted from? 
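/*
 * Illustrative sketch (not part of the patch): the argmax-by-txg walk
 * behind spa_alt_rootvdev().  Every leaf whose label can be read competes
 * on the pool txg recorded there, and the leaf with the newest txg becomes
 * the suggested boot device.  The struct and the read_label_txg callback
 * are simplified stand-ins for vdev_t and vdev_disk_read_rootlabel().
 */
#include <stdint.h>

struct boot_vdev {
	struct boot_vdev **children;
	int nchildren;
	int is_leaf;
	int (*read_label_txg)(struct boot_vdev *, uint64_t *);
};

static void
alt_rootvdev(struct boot_vdev *vd, struct boot_vdev **best, uint64_t *best_txg)
{
	uint64_t label_txg;

	for (int c = 0; c < vd->nchildren; c++)
		alt_rootvdev(vd->children[c], best, best_txg);

	if (vd->is_leaf && vd->read_label_txg(vd, &label_txg) == 0 &&
	    label_txg > *best_txg) {
		*best_txg = label_txg;
		*best = vd;
	}
}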
*/ - if (bootpath) { - cmn_err(CE_NOTE, "try booting from '%s'", bootpath); - return (EINVAL); - } -out: - *bestconf = conf; - return (0); } /* @@ -2458,24 +3177,35 @@ out: int spa_import_rootpool(char *devpath, char *devid) { - nvlist_t *conf = NULL; + spa_t *spa; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t guid, txg; char *pname; int error; - spa_t *spa; /* - * Get the vdev pathname and configuation from the most - * recently updated vdev (highest txg). + * Read the label from the boot device and generate a configuration. */ - if (error = spa_get_rootconf(devpath, devid, &conf)) - goto msg_out; - - /* - * Add type "root" vdev to the config. - */ - spa_build_rootpool_config(conf); + config = spa_generate_rootconf(devpath, devid, &guid); +#if defined(_OBP) && defined(_KERNEL) + if (config == NULL) { + if (strstr(devpath, "/iscsi/ssd") != NULL) { + /* iscsi boot */ + get_iscsi_bootpath_phy(devpath); + config = spa_generate_rootconf(devpath, devid, &guid); + } + } +#endif + if (config == NULL) { + cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + devpath); + return (EIO); + } - VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { @@ -2486,71 +3216,90 @@ spa_import_rootpool(char *devpath, char *devid) spa_remove(spa); } - spa = spa_add(pname, NULL); + spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; - spa->spa_load_verbatim = B_TRUE; - - VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0); - mutex_exit(&spa_namespace_lock); - - nvlist_free(conf); - return (0); + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; -msg_out: - cmn_err(CE_NOTE, "\n" - " *************************************************** \n" - " * This device is not bootable! * \n" - " * It is either offlined or detached or faulted. * \n" - " * Please try to boot from a different device. * \n" - " *************************************************** "); - - return (error); -} -#endif -#endif /* sun */ - -/* - * Take a pool and insert it into the namespace as if it had been loaded at - * boot. - */ -int -spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) -{ - spa_t *spa; - char *altroot = NULL; - - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { mutex_exit(&spa_namespace_lock); - return (EEXIST); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); } - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - - spa->spa_load_verbatim = B_TRUE; - - VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); - - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); + /* + * Get the boot vdev. 
+ */ + if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", + (u_longlong_t)guid); + error = ENOENT; + goto out; + } - spa_config_sync(spa, B_FALSE, B_TRUE); + /* + * Determine if there is a better boot device. + */ + avd = bvd; + spa_alt_rootvdev(rvd, &avd, &txg); + if (avd != bvd) { + cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " + "try booting from '%s'", avd->vdev_path); + error = EINVAL; + goto out; + } + + /* + * If the boot device is part of a spare vdev then ensure that + * we're booting off the active spare. + */ + if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && + !bvd->vdev_isspare) { + cmn_err(CE_NOTE, "The boot device is currently spared. Please " + "try booting from '%s'", + bvd->vdev_parent-> + vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); + error = EINVAL; + goto out; + } + error = 0; + spa_history_log_version(spa, LOG_POOL_IMPORT); +out: + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); - return (0); + nvlist_free(config); + return (error); } +#endif +#endif /* sun */ + /* * Import a non-root pool into the system. */ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; + spa_load_state_t state = SPA_LOAD_IMPORT; + zpool_rewind_policy_t policy; + uint64_t mode = spa_mode_global; + uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; @@ -2560,7 +3309,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { + if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (EEXIST); } @@ -2570,20 +3319,57 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - spa_activate(spa, spa_mode_global); + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) + mode = FREAD; + spa = spa_add(pool, config, altroot); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. + */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); + + return (0); + } + + spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when * doing an import. */ - error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, + policy.zrp_request); + + /* + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). 
+ */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -2660,6 +3446,14 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa->spa_l2cache.sav_sync = B_TRUE; } + /* + * Check for any removed devices. + */ + if (spa->spa_autoreplace) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. @@ -2667,17 +3461,23 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } + /* + * It's possible that the pool was expanded while it was exported. + * We kick off an async task to handle this for us. + */ + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); +#ifdef __FreeBSD__ +#ifdef _KERNEL + zvol_create_minors(pool); +#endif +#endif return (0); } -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. - */ -#define TRYIMPORT_NAME "$import" - nvlist_t * spa_tryimport(nvlist_t *tryconfig) { @@ -2697,7 +3497,7 @@ spa_tryimport(nvlist_t *tryconfig) * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, NULL); + spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* @@ -2705,7 +3505,7 @@ spa_tryimport(nvlist_t *tryconfig) * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ - error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -2850,7 +3650,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + 1; + spa->spa_final_txg = spa_last_synced_txg(spa) + + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } @@ -2920,13 +3721,15 @@ spa_reset(char *pool) int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; + uint64_t txg, id; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, @@ -2961,9 +3764,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * Transfer each new top-level vdev from vd to rvd. */ for (int c = 0; c < vd->vdev_children; c++) { + + /* + * Set the vdev id to the first hole, if one exists. 
+ */ + for (id = 0; id < rvd->vdev_children; id++) { + if (rvd->vdev_child[id]->vdev_ishole) { + vdev_free(rvd->vdev_child[id]); + break; + } + } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; + tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@ -3020,15 +3833,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { - uint64_t txg, open_txg; + uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; - dmu_tx_t *tx; char *oldvdpath, *newvdpath; int newvd_isspare; int error; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3078,7 +3892,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && - pvd->vdev_child[1] == oldvd && + oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -3090,23 +3904,24 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * the same (spare replaces spare, non-spare replaces * non-spare). */ - if (pvd->vdev_ops == &vdev_replacing_ops) + if (pvd->vdev_ops == &vdev_replacing_ops && + spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) + } else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops != &vdev_spare_ops && - newvd->vdev_isspare) + } + + if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* - * Compare the new device size with the replaceable/attachable - * device size. + * Make sure the new device is big enough. */ - if (newvd->vdev_psize < vdev_get_rsize(oldvd)) + if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -3132,6 +3947,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } } + /* mark the device being resilvered */ + newvd->vdev_resilvering = B_TRUE; + /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -3148,14 +3966,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; + newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); - /* - * If newvd is smaller than oldvd, but larger than its rsize, - * the addition of newvd may have decreased our parent's asize. - */ - pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); - tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -3163,13 +3976,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_config_dirty(tvd); /* - * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate - * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account + * for any dmu_sync-ed blocks. It will propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). 
*/ - open_txg = txg + TXG_CONCURRENT_STATES - 1; + dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, open_txg - TXG_INITIAL + 1); + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -3185,27 +3999,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + /* + * Restart the resilver + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + + /* + * Commit the config + */ + (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, - CRED(), "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? "replace" : "attach", newvdpath, - replacing ? "for" : "to", oldvdpath); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, + "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); - /* - * Kick off a resilver to update newvd. - */ - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); return (0); } @@ -3224,7 +4038,9 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; - size_t len; + char *vdpath; + + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -3255,18 +4071,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* - * If replace_done is specified, only remove this device if it's - * the first child of a replacing vdev. For the 'spare' vdev, either - * disk can be removed. + * Only 'replacing' or 'spare' vdevs can be replaced. */ - if (replace_done) { - if (pvd->vdev_ops == &vdev_replacing_ops) { - if (vd->vdev_id != 0) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } else if (pvd->vdev_ops != &vdev_spare_ops) { - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } - } + if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); @@ -3293,16 +4102,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. 
*/ - if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && - pvd->vdev_child[0]->vdev_path != NULL && - pvd->vdev_child[1]->vdev_path != NULL) { - ASSERT(pvd->vdev_child[1] == vd); - cvd = pvd->vdev_child[0]; - len = strlen(vd->vdev_path); - if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && - strcmp(cvd->vdev_path + len, "/old") == 0) { - spa_strfree(cvd->vdev_path); - cvd->vdev_path = spa_strdup(vd->vdev_path); + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && + vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path); + + for (int c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + + if (cvd == vd || cvd->vdev_path == NULL) + continue; + + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + break; + } } } @@ -3312,7 +4127,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) + vd->vdev_id == 0 && + pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* @@ -3334,7 +4150,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) /* * Remember one of the remaining children so we can get tvd below. */ - cvd = pvd->vdev_child[0]; + cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, @@ -3350,75 +4166,390 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + cvd->vdev_unspare = B_TRUE; + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) { + if (pvd->vdev_ops == &vdev_spare_ops) + cvd->vdev_unspare = B_FALSE; + vdev_remove_parent(cvd); + cvd->vdev_resilvering = B_FALSE; + } + + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + /* + * If the 'autoexpand' property is set on the pool then automatically + * try to expand the size of the pool. For example if the device we + * just detached was smaller than the others, it may be possible to + * add metaslabs (i.e. grow the pool). We need to reopen the vdev + * first so that we can obtain the updated sizes of the leaf vdevs. + */ + if (spa->spa_autoexpand) { + vdev_reopen(tvd); + vdev_expand(tvd, txg); + } + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. 
+ */ + vdpath = spa_strdup(vd->vdev_path); + for (int t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + + /* hang on to the spa before we release the lock */ + spa_open_ref(spa, FTAG); + + error = spa_vdev_exit(spa, vd, txg, 0); + + spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, + "vdev=%s", vdpath); + spa_strfree(vdpath); + + /* + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. + */ + if (unspare) { + spa_t *altspa = NULL; + + mutex_enter(&spa_namespace_lock); + while ((altspa = spa_next(altspa)) != NULL) { + if (altspa->spa_state != POOL_STATE_ACTIVE || + altspa == spa) + continue; + + spa_open_ref(altspa, FTAG); + mutex_exit(&spa_namespace_lock); + (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(altspa, FTAG); + } + mutex_exit(&spa_namespace_lock); + + /* search the rest of the vdevs for spares to remove */ + spa_vdev_resilver_done(spa); + } + + /* all done with the spa; OK to release */ + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + + return (error); +} + +/* + * Split a set of devices from their mirrors, and create a new pool from them. + */ +int +spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) +{ + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_offline_log(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if (vd->vdev_islog || vd->vdev_ishole) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = EINVAL; + break; + } + } + + /* which disk is going to be split? */ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = EINVAL; + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = ENODEV; + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + vml[c]->vdev_ishole || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_children != 0 || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = EINVAL; + break; + } + + if (vdev_dtl_required(vml[c])) { + error = EBUSY; + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); } - /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. - */ - if (pvd->vdev_children == 1) - vdev_remove_parent(cvd); + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; + } + vdev_reopen(spa->spa_root_vdev); + + /* + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. + */ + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); + + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + mutex_exit(&spa->spa_props_lock); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); + + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + + /* add the new pool to the namespace */ + newspa = spa_add(newname, config, altroot); + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); + + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); + + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); + +#ifndef sun + /* mark that we are creating new spa by splitting */ + newspa->spa_splitting_newspa = B_TRUE; +#endif + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); +#ifndef sun + newspa->spa_splitting_newspa = B_FALSE; +#endif + if (error) + goto out; + + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); + } + + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } + + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); + + spa_async_resume(newspa); + + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + vdev_split(vml[c]); + if (error == 0) + spa_history_log_internal(LOG_POOL_VDEV_DETACH, + spa, tx, "vdev=%s", + vml[c]->vdev_path); + vdev_free(vml[c]); + } + } + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); - /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. - */ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd); + /* split is complete; log a history record */ + spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, + "split new pool %s from pool %s", newname, spa_name(spa)); - /* - * If the device we just detached was smaller than the others, it may be - * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() - * can't fail because the existing metaslabs are already in core, so - * there's nothing to read from disk. 
- */ - VERIFY(vdev_metaslab_init(tvd, txg) == 0); + kmem_free(vml, children * sizeof (vdev_t *)); - vdev_config_dirty(tvd); + /* if we're not going to mount the filesystems in userland, export */ + if (exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. - */ - for (int t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); + return (error); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); +out: + spa_unload(newspa); + spa_deactivate(newspa); + spa_remove(newspa); - error = spa_vdev_exit(spa, vd, txg, 0); + txg = spa_vdev_config_enter(spa); - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. - */ - if (unspare) { - spa_t *myspa = spa; - spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - if (spa == myspa) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - } - mutex_exit(&spa_namespace_lock); + /* re-online all offlined disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_FALSE; } + vdev_reopen(spa->spa_root_vdev); + + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, error); + kmem_free(vml, children * sizeof (vdev_t *)); return (error); } @@ -3464,19 +4595,118 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, } /* + * Evacuate the device. + */ +static int +spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) +{ + uint64_t txg; + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + ASSERT(vd == vd->vdev_top); + + /* + * Evacuate the device. We don't hold the config lock as writer + * since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + if (vd->vdev_islog) { + if (vd->vdev_stat.vs_alloc != 0) + error = spa_offline_log(spa); + } else { + error = ENOTSUP; + } + + if (error) + return (error); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. + */ + ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); + txg = spa_vdev_config_enter(spa); + vd->vdev_removing = B_TRUE; + vdev_dirty(vd, 0, NULL, txg); + vdev_config_dirty(vd); + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + return (0); +} + +/* + * Complete the removal by cleaning up the namespace. + */ +static void +spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t id = vd->vdev_id; + boolean_t last_vdev = (id == (rvd->vdev_children - 1)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vd == vd->vdev_top); + + /* + * Only remove any devices which are empty. 
+ */ + if (vd->vdev_stat.vs_alloc != 0) + return; + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + vdev_free(vd); + + if (last_vdev) { + vdev_compact_children(rvd); + } else { + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + } + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + +/* + * Remove a device from the pool - + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ + +/* * Remove a device from the pool. Currently, this supports removing only hot - * spares and level 2 ARC devices. + * spares, slogs, and level 2 ARC devices. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; + metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; - uint_t nspares, nl2cache; uint64_t txg = 0; + uint_t nspares, nl2cache; int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + ASSERT(spa_writeable(spa)); + if (!locked) txg = spa_vdev_enter(spa); @@ -3509,6 +4739,49 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + ASSERT(vd == vd->vdev_top); + + /* + * XXX - Once we have bp-rewrite this should + * become the common case. + */ + + mg = vd->vdev_mg; + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Attempt to evacuate the vdev. + */ + error = spa_vdev_remove_evacuate(spa, vd); + + txg = spa_vdev_config_enter(spa); + + /* + * If we couldn't evacuate the vdev, unwind. + */ + if (error) { + metaslab_group_activate(mg); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* + * Clean up the vdev namespace. + */ + spa_vdev_remove_from_namespace(spa, vd); + } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). @@ -3535,22 +4808,29 @@ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* - * Check for a completed replacement. + * Check for a completed replacement. We always consider the first + * vdev in the list to be the oldest vdev, and the last one to be + * the newest (see spa_vdev_attach() for how that works). In + * the case where the newest vdev is faulted, we will not automatically + * remove it after a resilver completes. This is OK as it will require + * user intervention to determine which disk the admin wishes to keep. 
*/ - if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + if (vd->vdev_ops == &vdev_replacing_ops) { + ASSERT(vd->vdev_children > 1); + + newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; - newvd = vd->vdev_child[1]; if (vdev_dtl_empty(newvd, DTL_MISSING) && + vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } @@ -3558,15 +4838,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) /* * Check for a completed resilver with the 'unspare' flag set. */ - if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { - newvd = vd->vdev_child[0]; - oldvd = vd->vdev_child[1]; + if (vd->vdev_ops == &vdev_spare_ops) { + vdev_t *first = vd->vdev_child[0]; + vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; + + if (last->vdev_unspare) { + oldvd = first; + newvd = last; + } else if (first->vdev_unspare) { + oldvd = last; + newvd = first; + } else { + oldvd = NULL; + } - if (newvd->vdev_unspare && + if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && - !vdev_dtl_required(oldvd)) { - newvd->vdev_unspare = 0; + vdev_dtl_empty(newvd, DTL_OUTAGE) && + !vdev_dtl_required(oldvd)) return (oldvd); + + /* + * If there are more than two spares attached to a disk, + * and those spares are not required, then we want to + * attempt to free them up now so that they can be used + * by other pools. Once we're back down to a single + * disk+spare, we stop removing them. + */ + if (vd->vdev_children > 2) { + newvd = vd->vdev_child[1]; + + if (newvd->vdev_isspare && last->vdev_isspare && + vdev_dtl_empty(last, DTL_MISSING) && + vdev_dtl_empty(last, DTL_OUTAGE) && + !vdev_dtl_required(newvd)) + return (newvd); } } @@ -3593,9 +4899,9 @@ spa_vdev_resilver_done(spa_t *spa) * we need to detach the parent's first child (the original hot * spare) as well. */ - if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && + ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(ppvd->vdev_children == 2); sguid = ppvd->vdev_child[1]->vdev_guid; } spa_config_exit(spa, SCL_ALL, FTAG); @@ -3610,36 +4916,43 @@ spa_vdev_resilver_done(spa_t *spa) } /* - * Update the stored path or FRU for this vdev. Dirty the vdev configuration, - * relying on spa_vdev_enter/exit() to synchronize the labels and cache. + * Update the stored path or FRU for this vdev. 
*/ int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; - uint64_t txg; + boolean_t sync = B_FALSE; - txg = spa_vdev_enter(spa); + ASSERT(spa_writeable(spa)); + + spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(value); + if (strcmp(value, vd->vdev_path) != 0) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + sync = B_TRUE; + } } else { - if (vd->vdev_fru != NULL) + if (vd->vdev_fru == NULL) { + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); - vd->vdev_fru = spa_strdup(value); + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } } - vdev_config_dirty(vd->vdev_top); - - return (spa_vdev_exit(spa, NULL, txg, 0)); + return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int @@ -3656,40 +4969,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) /* * ========================================================================== - * SPA Scrubbing + * SPA Scanning * ========================================================================== */ int -spa_scrub(spa_t *spa, pool_scrub_type_t type) +spa_scan_stop(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (EBUSY); + return (dsl_scan_cancel(spa->spa_dsl_pool)); +} + +int +spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - if ((uint_t)type >= POOL_SCRUB_TYPES) + if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (ENOTSUP); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. 
*/ - if (type == POOL_SCRUB_RESILVER && + if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } - if (type == POOL_SCRUB_EVERYTHING && - spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && - spa->spa_dsl_pool->dp_scrub_isresilver) - return (EBUSY); - - if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { - return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); - } else if (type == POOL_SCRUB_NONE) { - return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); - } else { - return (EINVAL); - } + return (dsl_scan(spa->spa_dsl_pool, func)); } /* @@ -3702,7 +5013,8 @@ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { - vd->vdev_remove_wanted = 0; + vd->vdev_remove_wanted = B_FALSE; + vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* @@ -3726,7 +5038,7 @@ static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = 0; + vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } @@ -3735,6 +5047,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd) } static void +spa_async_autoexpand(spa_t *spa, vdev_t *vd) +{ + sysevent_id_t eid; + nvlist_t *attr; + char *physpath; + + if (!spa->spa_autoexpand) + return; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + spa_async_autoexpand(spa, cvd); + } + + if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) + return; + + physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); +} + +static void spa_async_thread(void *arg) { spa_t *spa = arg; @@ -3751,16 +5094,31 @@ spa_async_thread(void *arg) * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + uint64_t old_space, new_space; + mutex_enter(&spa_namespace_lock); + old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); + + /* + * If the pool grew as a result of the config update, + * then log an internal history event. + */ + if (new_space != old_space) { + spa_history_log_internal(LOG_POOL_VDEV_ONLINE, + spa, NULL, + "pool '%s' size: %llu(+%llu)", + spa_name(spa), new_space, new_space - old_space); + } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); @@ -3769,11 +5127,17 @@ spa_async_thread(void *arg) (void) spa_vdev_state_exit(spa, NULL, 0); } + if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_async_autoexpand(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + /* * See if any devices need to be probed. 
*/ if (tasks & SPA_ASYNC_PROBE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -3788,7 +5152,7 @@ spa_async_thread(void *arg) * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); + dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. @@ -3834,6 +5198,7 @@ spa_async_dispatch(spa_t *spa) void spa_async_request(spa_t *spa, int task) { + zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); @@ -3845,37 +5210,22 @@ spa_async_request(spa_t *spa, int task) * ========================================================================== */ -static void -spa_sync_deferred_frees(spa_t *spa, uint64_t txg) +static int +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { - bplist_t *bpl = &spa->spa_sync_bplist; - dmu_tx_t *tx; - blkptr_t blk; - uint64_t itor = 0; - zio_t *zio; - int error; - uint8_t c = 1; - - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - - while (bplist_iterate(bpl, &itor, &blk) == 0) { - ASSERT(blk.blk_birth < txg); - zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED)); - } - - error = zio_wait(zio); - ASSERT3U(error, ==, 0); + bpobj_t *bpo = arg; + bpobj_enqueue(bpo, bp, tx); + return (0); +} - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - bplist_vacate(bpl, tx); +static int +spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zio_t *zio = arg; - /* - * Pre-dirty the first block so we sync to convergence faster. - * (Usually only the first block is needed.) - */ - dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); - dmu_tx_commit(tx); + zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, + zio->io_flags)); + return (0); } static void @@ -3942,7 +5292,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], - B_FALSE, B_FALSE, B_TRUE); + B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) @@ -3982,7 +5332,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) * Set zpool properties. */ static void -spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; @@ -4023,9 +5373,11 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT(spa->spa_root != NULL); break; + case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* - * 'cachefile' is also a non-persisitent property. + * 'readonly' and 'cachefile' are also non-persisitent + * properties. */ break; default: @@ -4033,8 +5385,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * Set pool property values in the poolprops mos object. 
*/ if (spa->spa_pool_props_object == 0) { - objset_t *mos = spa->spa_meta_objset; - VERIFY((spa->spa_pool_props_object = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx)) > 0); @@ -4081,6 +5431,15 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; + case ZPOOL_PROP_AUTOEXPAND: + spa->spa_autoexpand = intval; + if (tx->tx_txg != TXG_INITIAL) + spa_async_request(spa, + SPA_ASYNC_AUTOEXPAND); + break; + case ZPOOL_PROP_DEDUPDITTO: + spa->spa_dedup_ditto = intval; + break; default: break; } @@ -4089,8 +5448,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* log internal history if this is not a zpool create */ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && tx->tx_txg != TXG_INITIAL) { - spa_history_internal_log(LOG_POOL_PROPSET, - spa, tx, cr, "%s %lld %s", + spa_history_log_internal(LOG_POOL_PROPSET, + spa, tx, "%s %lld %s", nvpair_name(elem), intval, spa_name(spa)); } } @@ -4099,6 +5458,42 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } /* + * Perform one-time upgrade on-disk changes. spa_version() does not + * reflect the new version this txg, so there must be no changes this + * txg to anything that the upgrade code depends on after it executes. + * Therefore this must be called after dsl_pool_sync() does the sync + * tasks. + */ +static void +spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + + ASSERT(spa->spa_sync_pass == 1); + + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && + spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { + dsl_pool_create_origin(dp, tx); + + /* Keeping the origin open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { + dsl_pool_upgrade_clones(dp, tx); + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { + dsl_pool_upgrade_dir_clones(dp, tx); + + /* Keeping the freedir open increases spa_minref */ + spa->spa_minref += 3; + } +} + +/* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ @@ -4107,13 +5502,15 @@ spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; - bplist_t *bpl = &spa->spa_sync_bplist; + bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; + bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; - int dirty_vdevs; int error; + VERIFY(spa_writeable(spa)); + /* * Lock out configuration changes. */ @@ -4146,8 +5543,6 @@ spa_sync(spa_t *spa, uint64_t txg) } spa_config_exit(spa, SCL_STATE, FTAG); - VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); - tx = dmu_tx_create_assigned(dp, txg); /* @@ -4171,34 +5566,29 @@ spa_sync(spa_t *spa, uint64_t txg) } } - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && - spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { - dsl_pool_create_origin(dp, tx); - - /* Keeping the origin open increases spa_minref */ - spa->spa_minref += 3; - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { - dsl_pool_upgrade_clones(dp, tx); - } - /* - * If anything has changed in this txg, push the deferred frees - * from the previous txg. 
If not, leave them alone so that we - * don't generate work on an otherwise idle system. + * If anything has changed in this txg, or if someone is waiting + * for this txg to sync (eg, spa_vdev_remove()), push the + * deferred frees from the previous txg. If not, leave them + * alone so that we don't generate work on an otherwise idle + * system. */ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || - !txg_list_empty(&dp->dp_sync_tasks, txg)) - spa_sync_deferred_frees(spa, txg); + !txg_list_empty(&dp->dp_sync_tasks, txg) || + ((dsl_scan_active(dp->dp_scan) || + txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { + zio_t *zio = zio_root(spa, NULL, NULL, 0); + VERIFY3U(bpobj_iterate(defer_bpo, + spa_free_sync_cb, zio, tx), ==, 0); + VERIFY3U(zio_wait(zio), ==, 0); + } /* * Iterate to convergence. */ do { - spa->spa_sync_pass++; + int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, @@ -4208,18 +5598,26 @@ spa_sync(spa_t *spa, uint64_t txg) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - dirty_vdevs = 0; - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { - vdev_sync(vd, txg); - dirty_vdevs++; + if (pass <= SYNC_PASS_DEFERRED_FREE) { + zio_t *zio = zio_root(spa, NULL, NULL, 0); + bplist_iterate(free_bpl, spa_free_sync_cb, + zio, tx); + VERIFY(zio_wait(zio) == 0); + } else { + bplist_iterate(free_bpl, bpobj_enqueue_cb, + defer_bpo, tx); } - bplist_sync(bpl, tx); - } while (dirty_vdevs); + ddt_sync(spa, txg); + dsl_scan_sync(dp, tx); + + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + vdev_sync(vd, txg); - bplist_close(bpl); + if (pass == 1) + spa_sync_upgrades(spa, tx); - dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); + } while (dmu_objset_is_dirty(mos, txg)); /* * Rewrite the vdev configuration (which includes the uberblock) @@ -4242,9 +5640,8 @@ spa_sync(spa_t *spa, uint64_t txg) int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); - int c; - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; @@ -4291,10 +5688,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_ubsync = spa->spa_uberblock; - /* - * Clean up the ZIL records for the synced txg. - */ - dsl_pool_zil_clean(dp); + dsl_pool_sync_done(dp, txg); /* * Update usable space statistics. @@ -4302,6 +5696,8 @@ spa_sync(spa_t *spa, uint64_t txg) while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) vdev_sync_done(vd, txg); + spa_update_dspace(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). @@ -4309,10 +5705,13 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - ASSERT(bpl->bpl_queue == NULL); + + spa->spa_sync_pass = 0; spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_handle_ignored_writes(spa); + /* * If any async tasks have been requested, kick them off. 
*/ @@ -4330,7 +5729,8 @@ spa_sync_allpools(void) spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -4410,6 +5810,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) void spa_upgrade(spa_t *spa, uint64_t version) { + ASSERT(spa_writeable(spa)); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -4479,7 +5881,6 @@ spa_has_active_shared_spare(spa_t *spa) void spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { -#if 0 #ifdef _KERNEL sysevent_t *ev; sysevent_attr_list_t *attr = NULL; @@ -4526,5 +5927,4 @@ done: sysevent_free_attr(attr); sysevent_free(ev); #endif -#endif } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index 34050ef..0b8255e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -36,6 +35,7 @@ #include #ifdef _KERNEL #include +#include #endif /* @@ -74,7 +74,6 @@ spa_config_load(void) void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; - spa_t *spa; char *pathname; struct _buf *file; uint64_t fsize; @@ -88,25 +87,21 @@ spa_config_load(void) file = kobj_open_file(pathname); - if (file == (struct _buf *)-1) { - ZFS_LOG(1, "Cannot open %s.", pathname); - goto out; - } + kmem_free(pathname, MAXPATHLEN); + + if (file == (struct _buf *)-1) + return; - if (kobj_get_filesize(file, &fsize) != 0) { - ZFS_LOG(1, "Cannot get size of %s.", pathname); + if (kobj_get_filesize(file, &fsize) != 0) goto out; - } buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ - if (kobj_read_file(file, buf, fsize, 0) < 0) { - ZFS_LOG(1, "Cannot read %s.", pathname); + if (kobj_read_file(file, buf, fsize, 0) < 0) goto out; - } /* * Unpack the nvlist. @@ -114,8 +109,6 @@ spa_config_load(void) if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; - ZFS_LOG(1, "File %s loaded.", pathname); - /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. @@ -123,7 +116,6 @@ spa_config_load(void) mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; @@ -131,33 +123,27 @@ spa_config_load(void) if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; - spa = spa_add(nvpair_name(nvpair), NULL); - - /* - * We blindly duplicate the configuration here. If it's - * invalid, we will catch it when the pool is first opened. 
- */ - VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); + (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: - kmem_free(pathname, MAXPATHLEN); if (buf != NULL) kmem_free(buf, fsize); - if (file != (struct _buf *)-1) - kobj_close_file(file); + + kobj_close_file(file); } static void spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { - int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; - char *buf, *temp; size_t buflen; + char *buf; vnode_t *vp; + int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; + char *temp; /* * If the nvlist is empty (NULL), then remove the old cachefile. @@ -328,6 +314,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; + uint64_t split_guid; if (vd == NULL) { vd = rvd; @@ -356,7 +343,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); +#ifdef _KERNEL + hostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so we can't use + * zone_get_hostid(). + */ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); +#endif /* _KERNEL */ if (hostid != 0) { VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid) == 0); @@ -376,12 +371,63 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL) == 0); vd = vd->vdev_top; /* label contains top config */ + } else { + /* + * Only add the (potentially large) split information + * in the mos config, and not in the vdev labels + */ + if (spa->spa_config_splitting != NULL) + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting) == 0); } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); + /* + * Add the top-level config. We even add this on pools which + * don't support holes in the namespace. + */ + vdev_top_config_generate(spa, config); + + /* + * If we're splitting, record the original pool's guid. 
+ */ + if (spa->spa_config_splitting != NULL && + nvlist_lookup_uint64(spa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, + split_guid) == 0); + } + + nvroot = vdev_config_generate(spa, vd, getstats, 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + + ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0); + kmem_free(ddh, sizeof (ddt_histogram_t)); + + ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); + ddt_get_dedup_object_stats(spa, ddo); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0); + kmem_free(ddo, sizeof (ddt_object_t)); + + dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); + ddt_get_dedup_stats(spa, dds); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_STATS, + (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0); + kmem_free(dds, sizeof (ddt_stat_t)); + } + if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); @@ -416,10 +462,9 @@ spa_config_update(spa_t *spa, int what) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) { - vdev_init(tvd, txg); - vdev_config_dirty(tvd); - } + if (tvd->vdev_ms_array == 0) + vdev_metaslab_set_size(tvd); + vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c index e1ae491..282140b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -54,38 +53,6 @@ #include #include -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexidecimal numbers that don't overflow. - */ -#ifdef _KERNEL -uint64_t -_strtonum(const char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - if (nptr) - *nptr = (char *)str; - - return (val); -} -#endif /* * Convert a bookmark to a string. 
@@ -105,13 +72,13 @@ bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) static void name_to_bookmark(char *buf, zbookmark_t *zb) { - zb->zb_objset = _strtonum(buf, &buf); + zb->zb_objset = strtonum(buf, &buf); ASSERT(*buf == ':'); - zb->zb_object = _strtonum(buf + 1, &buf); + zb->zb_object = strtonum(buf + 1, &buf); ASSERT(*buf == ':'); - zb->zb_level = (int)_strtonum(buf + 1, &buf); + zb->zb_level = (int)strtonum(buf + 1, &buf); ASSERT(*buf == ':'); - zb->zb_blkid = _strtonum(buf + 1, &buf); + zb->zb_blkid = strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } #endif @@ -134,7 +101,7 @@ spa_log_error(spa_t *spa, zio_t *zio) * If we are trying to import a pool, ignore any errors, as we won't be * writing to the pool any time soon. */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c index b403ccb..942636b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -32,6 +31,7 @@ #include #include #include +#include "zfs_comutil.h" #ifdef _KERNEL #include #include @@ -103,7 +103,8 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) * Figure out maximum size of history log. We set it at * 1% of pool size, with a max of 32MB and min of 128KB. */ - shpp->sh_phys_max_off = spa_get_dspace(spa) / 100; + shpp->sh_phys_max_off = + metaslab_class_get_dspace(spa_normal_class(spa)) / 100; shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); @@ -187,8 +188,9 @@ spa_history_zone() /* * Write out a history event. 
*/ +/*ARGSUSED*/ static void -spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; history_arg_t *hap = arg2; @@ -231,9 +233,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, gethrestime_sec()) == 0); - VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, - (uint64_t)crgetuid(cr)) == 0); - if (hap->ha_zone[0] != '\0') + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0); + if (hap->ha_zone != NULL) VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, hap->ha_zone) == 0); #ifdef _KERNEL @@ -244,6 +245,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) hap->ha_log_type == LOG_CMD_NORMAL) { VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); + + zfs_dbgmsg("command: %s", history_str); } else { VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, hap->ha_event) == 0); @@ -251,6 +254,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) tx->tx_txg) == 0); VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, history_str) == 0); + + zfs_dbgmsg("internal %s pool:%s txg:%llu %s", + zfs_history_event_names[hap->ha_event], spa_name(spa), + (longlong_t)tx->tx_txg, history_str); + } VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); @@ -279,10 +287,10 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) kmem_free(record_packed, reclen); dmu_buf_rele(dbp, FTAG); - if (hap->ha_log_type == LOG_INTERNAL) { - kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN); - kmem_free(hap, sizeof (history_arg_t)); - } + strfree(hap->ha_history_str); + if (hap->ha_zone != NULL) + strfree(hap->ha_zone); + kmem_free(hap, sizeof (history_arg_t)); } /* @@ -291,15 +299,32 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) int spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) { - history_arg_t ha; + history_arg_t *ha; + int err = 0; + dmu_tx_t *tx; ASSERT(what != LOG_INTERNAL); - ha.ha_history_str = history_str; - ha.ha_log_type = what; - (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone)); - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, - spa, &ha, 0)); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); + ha->ha_history_str = strdup(history_str); + ha->ha_zone = strdup(spa_history_zone()); + ha->ha_log_type = what; + ha->ha_uid = crgetuid(CRED()); + + /* Kick this off asynchronously; errors are ignored. */ + dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, + spa_history_log_sync, spa, ha, 0, tx); + dmu_tx_commit(tx); + + /* spa_history_log_sync will free ha and strings */ + return (err); } /* @@ -322,6 +347,14 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) if (!spa->spa_history) return (ENOENT); + /* + * The history is logged asynchronously, so when they request + * the first chunk of history, make sure everything has been + * synced to disk so that we get it. 
+ */ + if (*offp == 0 && spa_writeable(spa)) + txg_wait_synced(spa_get_dsl(spa), 0); + if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); shpp = dbp->db_data; @@ -391,13 +424,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (err); } -void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +static void +log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, const char *fmt, va_list adx) { - history_arg_t *hap; - char *str; - va_list adx; + history_arg_t *ha; + va_list adx2; /* * If this is part of creating a pool, not everything is @@ -406,23 +438,71 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, if (tx->tx_txg == TXG_INITIAL) return; - hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); - str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + va_copy(adx2, adx); - va_start(adx, fmt); - (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); - va_end(adx); + ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); + ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx2) + 1, + KM_SLEEP); + + va_end(adx2); + + (void) vsprintf(ha->ha_history_str, fmt, adx); - hap->ha_log_type = LOG_INTERNAL; - hap->ha_history_str = str; - hap->ha_event = event; - hap->ha_zone[0] = '\0'; + ha->ha_log_type = LOG_INTERNAL; + ha->ha_event = event; + ha->ha_zone = NULL; + ha->ha_uid = 0; if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(spa, hap, cr, tx); + spa_history_log_sync(spa, ha, tx); } else { dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, - spa_history_log_sync, spa, hap, 0, tx); + spa_history_log_sync, spa, ha, 0, tx); } - /* spa_history_log_sync() will free hap and str */ + /* spa_history_log_sync() will free ha and strings */ +} + +void +spa_history_log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, const char *fmt, ...) +{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(event, spa, htx, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_version(spa_t *spa, history_internal_events_t event) +{ +#ifdef _KERNEL + uint64_t current_vers = spa_version(spa); + + if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { + spa_history_log_internal(event, spa, NULL, + "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", + (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); + } + cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", + event == LOG_POOL_IMPORT ? "imported" : + event == LOG_POOL_CREATE ? "created" : "accessed", + (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); +#endif } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 89e0301..1709f68 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
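
[Editor's note - illustrative sketch, not part of the patch: with the reworked interface above, internal history events go through spa_history_log_internal(), which takes an optional transaction. When tx is NULL it creates and commits one on the MOS directory itself, and spa_history_log_sync() later frees the history_arg_t and its strings. A hypothetical call, using only identifiers visible in this diff, might look like:]

    /* from open (non-syncing) context, let the function make its own tx */
    spa_history_log_internal(LOG_POOL_IMPORT, spa, NULL,
        "pool version %llu", (u_longlong_t)spa_version(spa));

    /* from syncing context, pass the transaction that is already open */
    spa_history_log_internal(LOG_POOL_IMPORT, spa, tx,
        "txg %llu", (u_longlong_t)tx->tx_txg);
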
*/ #include @@ -41,10 +40,11 @@ #include #include #include +#include #include #include -#include #include +#include #include "zfs_prop.h" /* @@ -186,7 +186,7 @@ * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial - * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the + * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * @@ -314,8 +314,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { + int wlocks_held = 0; + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); @@ -335,6 +339,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } + ASSERT(wlocks_held <= locks); } void @@ -419,7 +424,7 @@ spa_lookup(const char *name) * exist by calling spa_lookup() first. */ spa_t * -spa_add(const char *name, const char *altroot) +spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; @@ -429,29 +434,36 @@ spa_add(const char *name, const char *altroot) spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < TXG_SIZE; t++) + bplist_create(&spa->spa_free_bplist[t]); + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; + spa->spa_load_max_txg = UINT64_MAX; + spa->spa_proc = &p0; + spa->spa_proc_state = SPA_PROC_NONE; refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); - mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); - /* * Set the alternate root, if there is one. */ @@ -467,9 +479,15 @@ spa_add(const char *name, const char *altroot) offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); - dp->scd_path = spa_strdup(spa_config_path); + dp->scd_path = altroot ? 
NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); + VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + if (config != NULL) + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + return (spa); } @@ -486,6 +504,8 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + nvlist_free(spa->spa_config_splitting); + avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); @@ -503,24 +523,30 @@ spa_remove(spa_t *spa) list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); + for (int t = 0; t < TXG_SIZE; t++) + bplist_destroy(&spa->spa_free_bplist[t]); + cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_scrub_lock); - mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); + mutex_destroy(&spa->spa_vdev_top_lock); kmem_free(spa, sizeof (spa_t)); } @@ -814,12 +840,6 @@ spa_l2cache_activate(vdev_t *vd) mutex_exit(&spa_l2cache_lock); } -void -spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) -{ - vdev_space_update(vd, space, alloc, B_FALSE); -} - /* * ========================================================================== * SPA vdev locking @@ -834,7 +854,20 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) uint64_t spa_vdev_enter(spa_t *spa) { + mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + return (spa_vdev_config_enter(spa)); +} + +/* + * Internal implementation for spa_vdev_enter(). Used when a vdev + * operation requires multiple syncs (i.e. removing a device) while + * keeping the spa_namespace_lock held. + */ +uint64_t +spa_vdev_config_enter(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); @@ -842,14 +875,14 @@ spa_vdev_enter(spa_t *spa) } /* - * Unlock the spa_t after adding or removing a vdev. Besides undoing the - * locking of spa_vdev_enter(), we also want make sure the transactions have - * synced to disk, and then update the global configuration cache with the new - * information. + * Used in combination with spa_vdev_config_enter() to allow the syncing + * of multiple transactions without releasing the spa_namespace_lock. */ -int -spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +void +spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); @@ -861,17 +894,28 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - /* - * If the config changed, notify the scrub thread that it must restart. - */ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { - dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; + spa->spa_config_generation++; } + /* + * Verify the metaslab classes. 
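
[Editor's note - illustrative sketch, not part of the patch: spa_vdev_config_enter()/spa_vdev_config_exit() are split out of spa_vdev_enter()/spa_vdev_exit() so that an operation such as device removal can sync several transaction groups while continuing to hold spa_namespace_lock. A rough caller shape, assuming the multi-sync pattern described in the comments above:]

    uint64_t txg = spa_vdev_enter(spa);

    /* ... first batch of configuration changes ... */

    /* sync them out without dropping spa_namespace_lock */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
    txg = spa_vdev_config_enter(spa);

    /* ... second batch of changes, in a later txg ... */

    return (spa_vdev_exit(spa, NULL, txg, 0));
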
+ */ + ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + spa_config_exit(spa, SCL_ALL, spa); /* + * Panic the system if the specified tag requires it. This + * is useful for ensuring that configurations are updated + * transactionally. + */ + if (zio_injection_enabled) + zio_handle_panic_injection(spa, tag, 0); + + /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. @@ -891,8 +935,20 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) */ if (config_changed) spa_config_sync(spa, B_FALSE, B_TRUE); +} +/* + * Unlock the spa_t after adding or removing a vdev. Besides undoing the + * locking of spa_vdev_enter(), we also want make sure the transactions have + * synced to disk, and then update the global configuration cache with the new + * information. + */ +int +spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +{ + spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); return (error); } @@ -901,18 +957,52 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) * Lock the given spa_t for the purpose of changing vdev state. */ void -spa_vdev_state_enter(spa_t *spa) +spa_vdev_state_enter(spa_t *spa, int oplocks) { - spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); + int locks = SCL_STATE_ALL | oplocks; + + /* + * Root pools may need to read of the underlying devfs filesystem + * when opening up a vdev. Unfortunately if we're holding the + * SCL_ZIO lock it will result in a deadlock when we try to issue + * the read from the root filesystem. Instead we "prefetch" + * the associated vnodes that we need prior to opening the + * underlying devices and cache them so that we can prevent + * any I/O when we are doing the actual open. + */ + if (spa_is_root(spa)) { + int low = locks & ~(SCL_ZIO - 1); + int high = locks & ~low; + + spa_config_enter(spa, high, spa, RW_WRITER); + vdev_hold(spa->spa_root_vdev); + spa_config_enter(spa, low, spa, RW_WRITER); + } else { + spa_config_enter(spa, locks, spa, RW_WRITER); + } + spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { - if (vd != NULL) + boolean_t config_changed = B_FALSE; + + if (vd != NULL || error == 0) + vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, + 0, 0, B_FALSE); + + if (vd != NULL) { vdev_state_dirty(vd->vdev_top); + config_changed = B_TRUE; + spa->spa_config_generation++; + } + + if (spa_is_root(spa)) + vdev_rele(spa->spa_root_vdev); - spa_config_exit(spa, SCL_STATE_ALL, spa); + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); + spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, @@ -923,6 +1013,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); + /* + * If the config changed, update the config cache. + */ + if (config_changed) { + mutex_enter(&spa_namespace_lock); + spa_config_sync(spa, B_FALSE, B_TRUE); + mutex_exit(&spa_namespace_lock); + } + return (error); } @@ -982,14 +1081,13 @@ spa_rename(const char *name, const char *newname) return (0); } - /* - * Determine whether a pool with given pool_guid exists. If device_guid is - * non-zero, determine whether the pool exists *and* contains a device with the - * specified device_guid. 
+ * Return the spa_t associated with given pool_guid, if it exists. If + * device_guid is non-zero, determine whether the pool exists *and* contains + * a device with the specified device_guid. */ -boolean_t -spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +spa_t * +spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; @@ -1020,7 +1118,16 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) } } - return (spa != NULL); + return (spa); +} + +/* + * Determine whether a pool with the given pool_guid exists. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + return (spa_by_guid(pool_guid, device_guid) != NULL); } char * @@ -1055,48 +1162,36 @@ spa_get_random(uint64_t range) return (r % range); } -void -sprintf_blkptr(char *buf, int len, const blkptr_t *bp) +uint64_t +spa_generate_guid(spa_t *spa) { - int d; + uint64_t guid = spa_get_random(-1ULL); - if (bp == NULL) { - (void) snprintf(buf, len, ""); - return; + if (spa != NULL) { + while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } else { + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); } - if (BP_IS_HOLE(bp)) { - (void) snprintf(buf, len, ""); - return; - } + return (guid); +} + +void +sprintf_blkptr(char *buf, const blkptr_t *bp) +{ + char *type = NULL; + char *checksum = NULL; + char *compress = NULL; - (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", - (u_longlong_t)BP_GET_LEVEL(bp), - dmu_ot[BP_GET_TYPE(bp)].ot_name, - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp)); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - const dva_t *dva = &bp->blk_dva[d]; - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "DVA[%d]=<%llu:%llx:%llx> ", d, - (u_longlong_t)DVA_GET_VDEV(dva), - (u_longlong_t)DVA_GET_OFFSET(dva), - (u_longlong_t)DVA_GET_ASIZE(dva)); + if (bp != NULL) { + type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", - BP_IS_GANG(bp) ? "gang" : "contiguous", - (u_longlong_t)bp->blk_birth, - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); } void @@ -1233,59 +1328,55 @@ spa_first_txg(spa_t *spa) return (spa->spa_first_txg); } +uint64_t +spa_syncing_txg(spa_t *spa) +{ + return (spa->spa_syncing_txg); +} + pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } -uint64_t -spa_freeze_txg(spa_t *spa) +spa_load_state_t +spa_load_state(spa_t *spa) { - return (spa->spa_freeze_txg); + return (spa->spa_load_state); } -/* - * Return how much space is allocated in the pool (ie. sum of all asize) - */ uint64_t -spa_get_alloc(spa_t *spa) +spa_freeze_txg(spa_t *spa) { - return (spa->spa_root_vdev->vdev_stat.vs_alloc); + return (spa->spa_freeze_txg); } -/* - * Return how much (raid-z inflated) space there is in the pool. 
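
[Editor's note - illustrative sketch, not part of the patch: sprintf_blkptr() loses its length argument in this import; callers now supply a buffer of at least BP_SPRINTF_LEN bytes, as the dprintf_dbuf_bp() change later in this diff does. For example:]

    char blkbuf[BP_SPRINTF_LEN];

    sprintf_blkptr(blkbuf, bp);     /* bp may be NULL or a hole */
    zfs_dbgmsg("block pointer: %s", blkbuf);
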
- */ +/* ARGSUSED */ uint64_t -spa_get_space(spa_t *spa) +spa_get_asize(spa_t *spa, uint64_t lsize) { - return (spa->spa_root_vdev->vdev_stat.vs_space); + /* + * The worst case is single-sector max-parity RAID-Z blocks, in which + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) + * times the size; so just assume that. Add to this the fact that + * we can have up to 3 DVAs per bp, and one more factor of 2 because + * the block may be dittoed with up to 3 DVAs by ddt_sync(). + */ + return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); } -/* - * Return the amount of raid-z-deflated space in the pool. - */ uint64_t spa_get_dspace(spa_t *spa) { - if (spa->spa_deflate) - return (spa->spa_root_vdev->vdev_stat.vs_dspace); - else - return (spa->spa_root_vdev->vdev_stat.vs_space); + return (spa->spa_dspace); } -/* ARGSUSED */ -uint64_t -spa_get_asize(spa_t *spa, uint64_t lsize) +void +spa_update_dspace(spa_t *spa) { - /* - * For now, the worst case is 512-byte RAID-Z blocks, in which - * case the space requirement is exactly 2x; so just assume that. - * Add to this the fact that we can have up to 3 DVAs per bp, and - * we have to multiply by a total of 6x. - */ - return (lsize * 6); + spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + + ddt_get_dedup_dspace(spa); } /* @@ -1310,6 +1401,24 @@ spa_version(spa_t *spa) return (spa->spa_ubsync.ub_version); } +boolean_t +spa_deflate(spa_t *spa) +{ + return (spa->spa_deflate); +} + +metaslab_class_t * +spa_normal_class(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +metaslab_class_t * +spa_log_class(spa_t *spa) +{ + return (spa->spa_log_class); +} + int spa_max_replication(spa_t *spa) { @@ -1323,24 +1432,52 @@ spa_max_replication(spa_t *spa) return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } +int +spa_prev_software_version(spa_t *spa) +{ + return (spa->spa_prev_software_version); +} + uint64_t -bp_get_dasize(spa_t *spa, const blkptr_t *bp) +dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { - int sz = 0, i; + uint64_t asize = DVA_GET_ASIZE(dva); + uint64_t dsize = asize; - if (!spa->spa_deflate) - return (BP_GET_ASIZE(bp)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (i = 0; i < SPA_DVAS_PER_BP; i++) { - vdev_t *vd = - vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); - if (vd) - sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> - SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; + if (asize != 0 && spa->spa_deflate) { + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } + + return (dsize); +} + +uint64_t +bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + return (dsize); +} + +uint64_t +bp_get_dsize(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + spa_config_exit(spa, SCL_VDEV, FTAG); - return (sz); + + return (dsize); } /* @@ -1442,9 +1579,18 @@ spa_has_slogs(spa_t *spa) return (spa->spa_log_class->mc_rotor != NULL); } -/* - * Return whether this pool is the root pool. 
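
[Editor's note - worked example, not part of the patch: with triple-parity RAID-Z, VDEV_RAIDZ_MAXPARITY is 3, so the new spa_get_asize() charges lsize * (3 + 1) * SPA_DVAS_PER_BP (3) * 2 = 24x the logical size; a 128 KB block therefore reserves up to 3 MB of worst-case space, replacing the old flat 6x estimate.]
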
- */ +spa_log_state_t +spa_get_log_state(spa_t *spa) +{ + return (spa->spa_log_state); +} + +void +spa_set_log_state(spa_t *spa, spa_log_state_t state) +{ + spa->spa_log_state = state; +} + boolean_t spa_is_root(spa_t *spa) { @@ -1462,3 +1608,69 @@ spa_mode(spa_t *spa) { return (spa->spa_mode); } + +uint64_t +spa_bootfs(spa_t *spa) +{ + return (spa->spa_bootfs); +} + +uint64_t +spa_delegation(spa_t *spa) +{ + return (spa->spa_delegation); +} + +objset_t * +spa_meta_objset(spa_t *spa) +{ + return (spa->spa_meta_objset); +} + +enum zio_checksum +spa_dedup_checksum(spa_t *spa) +{ + return (spa->spa_dedup_checksum); +} + +/* + * Reset pool scan stat per scan pass (or reboot). + */ +void +spa_scan_stat_init(spa_t *spa) +{ + /* data not stored on disk */ + spa->spa_scan_pass_start = gethrestime_sec(); + spa->spa_scan_pass_exam = 0; + vdev_scan_stat_init(spa->spa_root_vdev); +} + +/* + * Get scan stats for zpool status reports + */ +int +spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) +{ + dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; + + if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + return (ENOENT); + bzero(ps, sizeof (pool_scan_stat_t)); + + /* data stored on disk */ + ps->pss_func = scn->scn_phys.scn_func; + ps->pss_start_time = scn->scn_phys.scn_start_time; + ps->pss_end_time = scn->scn_phys.scn_end_time; + ps->pss_to_examine = scn->scn_phys.scn_to_examine; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_processed = scn->scn_phys.scn_processed; + ps->pss_errors = scn->scn_phys.scn_errors; + ps->pss_state = scn->scn_phys.scn_state; + + /* data not stored on disk */ + ps->pss_pass_start = spa->spa_scan_pass_start; + ps->pss_pass_exam = spa->spa_scan_pass_exam; + + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index d025141..1ce7b2a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -258,8 +258,10 @@ space_map_load_wait(space_map_t *sm) { ASSERT(MUTEX_HELD(sm->sm_lock)); - while (sm->sm_loading) + while (sm->sm_loading) { + ASSERT(!sm->sm_loaded); cv_wait(&sm->sm_load_cv, sm->sm_lock); + } } /* @@ -276,11 +278,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); - - space_map_load_wait(sm); - - if (sm->sm_loaded) - return (0); + ASSERT(!sm->sm_loaded); + ASSERT(!sm->sm_loading); sm->sm_loading = B_TRUE; end = smo->smo_objsize; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index f52851d..8f189c6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
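
[Editor's note - illustrative sketch, not part of the patch: the space_map_load() change above moves the wait-and-check out of the function, which now asserts that the map is neither loaded nor loading. Callers are expected to do something along these lines, where ops, smo and os stand for whatever the caller already holds:]

    mutex_enter(sm->sm_lock);
    space_map_load_wait(sm);
    if (!sm->sm_loaded)
        error = space_map_load(sm, ops, SM_FREE, smo, os);
    mutex_exit(sm->sm_lock);
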
*/ #ifndef _SYS_ARC_H @@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; - krwlock_t b_lock; + kmutex_t b_evict_lock; + krwlock_t b_data_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; @@ -87,10 +87,13 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); +int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); @@ -99,27 +102,16 @@ void arc_buf_thaw(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -typedef struct writeprops { - dmu_object_type_t wp_type; - uint8_t wp_level; - uint8_t wp_copies; - uint8_t wp_dncompress, wp_oscompress; - uint8_t wp_dnchecksum, wp_oschecksum; -} writeprops_t; - -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb); -int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, uint32_t *arc_flags, const zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb); -int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); int arc_buf_evict(arc_buf_t *buf); @@ -135,7 +127,7 @@ void arc_fini(void); * Level 2 ARC */ -void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_init(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h index cdb93a6..471be90 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h @@ -19,68 +19,36 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_BPLIST_H #define _SYS_BPLIST_H -#include -#include -#include #include +#include #ifdef __cplusplus extern "C" { #endif -typedef struct bplist_phys { - /* - * This is the bonus buffer for the dead lists. 
The object's - * contents is an array of bpl_entries blkptr_t's, representing - * a total of bpl_bytes physical space. - */ - uint64_t bpl_entries; - uint64_t bpl_bytes; - uint64_t bpl_comp; - uint64_t bpl_uncomp; -} bplist_phys_t; - -#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) - -typedef struct bplist_q { - blkptr_t bpq_blk; - void *bpq_next; -} bplist_q_t; +typedef struct bplist_entry { + blkptr_t bpe_blk; + list_node_t bpe_node; +} bplist_entry_t; typedef struct bplist { kmutex_t bpl_lock; - objset_t *bpl_mos; - uint64_t bpl_object; - uint8_t bpl_blockshift; - uint8_t bpl_bpshift; - uint8_t bpl_havecomp; - bplist_q_t *bpl_queue; - bplist_phys_t *bpl_phys; - dmu_buf_t *bpl_dbuf; - dmu_buf_t *bpl_cached_dbuf; + list_t bpl_list; } bplist_t; -extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); -extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); -extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); -extern void bplist_close(bplist_t *bpl); -extern boolean_t bplist_empty(bplist_t *bpl); -extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); -extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); -extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); -extern int bplist_space(bplist_t *bpl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -extern int bplist_space_birthrange(bplist_t *bpl, - uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); +typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +void bplist_create(bplist_t *bpl); +void bplist_destroy(bplist_t *bpl); +void bplist_append(bplist_t *bpl, const blkptr_t *bp); +void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, + void *arg, dmu_tx_t *tx); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h new file mode 100644 index 0000000..3771a95 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_BPOBJ_H +#define _SYS_BPOBJ_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bpobj_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpo_entries blkptr_t's, representing + * a total of bpo_bytes physical space. 
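
[Editor's note - illustrative sketch, not part of the patch: the new bplist_t is a purely in-core list of block pointers; the on-disk role moves to bpobj below. A hypothetical consumer, with my_free_cb standing in for a caller-supplied bplist_itor_t:]

    bplist_t bpl;

    bplist_create(&bpl);
    bplist_append(&bpl, bp);        /* remember a block to process later */
    /* ... */
    bplist_iterate(&bpl, my_free_cb, arg, tx);  /* drains the list */
    bplist_destroy(&bpl);
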
+ */ + uint64_t bpo_num_blkptrs; + uint64_t bpo_bytes; + uint64_t bpo_comp; + uint64_t bpo_uncomp; + uint64_t bpo_subobjs; + uint64_t bpo_num_subobjs; +} bpobj_phys_t; + +#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) + +typedef struct bpobj { + kmutex_t bpo_lock; + objset_t *bpo_os; + uint64_t bpo_object; + int bpo_epb; + uint8_t bpo_havecomp; + uint8_t bpo_havesubobj; + bpobj_phys_t *bpo_phys; + dmu_buf_t *bpo_dbuf; + dmu_buf_t *bpo_cached_dbuf; +} bpobj_t; + +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); +void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); + +int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); +void bpobj_close(bpobj_t *bpo); + +int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); +int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp); + +void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); + +int bpobj_space(bpobj_t *bpo, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPOBJ_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 7e2754d..cf1bbc0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DBUF_H #define _SYS_DBUF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -35,12 +32,12 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #endif -#define DB_BONUS_BLKID (-1ULL) #define IN_DMU_SYNC 2 /* @@ -55,25 +52,28 @@ extern "C" { #define DB_RF_CACHED (1 << 5) /* - * The state transition diagram for dbufs looks like: + * The simplified state transition diagram for dbufs looks like: * * +----> READ ----+ * | | * | V * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ - * | | - * +----> FILL ----+ + * | ^ ^ + * | | | + * +----> FILL ----+ | + * | | + * | | + * +--------> NOFILL -------+ */ typedef enum dbuf_states { DB_UNCACHED, DB_FILL, + DB_NOFILL, DB_READ, DB_CACHED, DB_EVICTING } dbuf_states_t; -struct objset_impl; struct dnode; struct dmu_tx; @@ -83,9 +83,6 @@ struct dmu_tx; * etc. 
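
[Editor's note - illustrative sketch, not part of the patch: bpobj is the on-disk replacement for the old bplist object and is what the new deadlists are built on. A minimal usage sketch, with my_cb a caller-supplied bpobj_itor_t and mos the meta-objset:]

    uint64_t obj = bpobj_alloc(mos, SPA_MAXBLOCKSIZE, tx);
    bpobj_t bpo;

    VERIFY(bpobj_open(&bpo, mos, obj) == 0);
    bpobj_enqueue(&bpo, bp, tx);
    /* ... */
    VERIFY(bpobj_iterate(&bpo, my_cb, arg, tx) == 0);
    bpobj_close(&bpo);
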
*/ -#define LIST_LINK_INACTIVE(link) \ - ((link)->list_next == NULL && (link)->list_prev == NULL) - struct dmu_buf_impl; typedef enum override_states { @@ -132,6 +129,7 @@ typedef struct dbuf_dirty_record { arc_buf_t *dr_data; blkptr_t dr_overridden_by; override_states_t dr_override_state; + uint8_t dr_copies; } dl; } dt; } dbuf_dirty_record_t; @@ -146,18 +144,20 @@ typedef struct dmu_buf_impl { dmu_buf_t db; /* the objset we belong to */ - struct objset_impl *db_objset; + struct objset *db_objset; /* - * the dnode we belong to (NULL when evicted) + * handle to safely access the dnode we belong to (NULL when evicted) */ - struct dnode *db_dnode; + struct dnode_handle *db_dnode_handle; /* * our parent buffer; if the dnode points to us directly, - * db_parent == db_dnode->dn_dbuf + * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf * only accessed by sync thread ??? * (NULL when evicted) + * May change from NULL to non-NULL under the protection of db_mtx + * (see dbuf_check_blkptr()) */ struct dmu_buf_impl *db_parent; @@ -240,6 +240,10 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); void dbuf_create_bonus(struct dnode *dn); +int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); +void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); + +void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, @@ -253,17 +257,19 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); @@ -271,30 +277,53 @@ void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); +void dbuf_release_bp(dmu_buf_impl_t *db); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); +#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) +#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) +#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) +#define DB_GET_SPA(_spa_p, _db) { \ + dnode_t *__dn; \ + DB_DNODE_ENTER(_db); \ + __dn = DB_DNODE(_db); \ + *(_spa_p) = __dn->dn_objset->os_spa; \ + DB_DNODE_EXIT(_db); \ +} +#define DB_GET_OBJSET(_os_p, _db) { \ + dnode_t *__dn; \ + DB_DNODE_ENTER(_db); \ + __dn = DB_DNODE(_db); \ + *(_os_p) = __dn->dn_objset; \ 
+ DB_DNODE_EXIT(_db); \ +} + void dbuf_init(void); void dbuf_fini(void); -#define DBUF_IS_METADATA(db) \ - ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) +boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); + +#define DBUF_IS_METADATA(_db) \ + (dbuf_is_metadata(_db)) -#define DBUF_GET_BUFC_TYPE(db) \ - (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) +#define DBUF_GET_BUFC_TYPE(_db) \ + (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) -#define DBUF_IS_CACHEABLE(db) \ - ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(db) && \ - ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) +#define DBUF_IS_CACHEABLE(_db) \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(_db) && \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(db) \ - ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(db) && \ - ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) +#define DBUF_IS_L2CACHEABLE(_db) \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(_db) && \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #ifdef ZFS_DEBUG @@ -322,10 +351,10 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + sprintf_blkptr(__blkbuf, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ + } \ _NOTE(CONSTCOND) } while (0) #define DBUF_VERIFY(db) dbuf_verify(db) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h new file mode 100644 index 0000000..9724d6e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_DDT_H +#define _SYS_DDT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ +enum ddt_type { + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). 
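
[Editor's note - illustrative sketch, not part of the patch: the DB_DNODE_* macros defined just above replace the old db_dnode pointer; a dbuf's dnode must now be reached through its zrlock-protected handle. With db a dmu_buf_impl_t pointer, the pattern is roughly:]

    dnode_t *dn;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    /* ... use dn only while the handle is held ... */
    DB_DNODE_EXIT(db);
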
+ */ +enum ddt_class { + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). + */ +typedef struct ddt_key { + zio_cksum_t ddk_cksum; /* 256-bit block checksum */ + uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ +} ddt_key_t; + +/* + * ddk_prop layout: + * + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ +#define DDK_GET_LSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_LSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_PSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_PSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +typedef struct ddt_phys { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; +} ddt_phys_t; + +enum ddt_phys_type { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_TYPES +}; + +/* + * In-core ddt entry + */ +struct ddt_entry { + ddt_key_t dde_key; + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + void *dde_repair_data; + enum ddt_type dde_type; + enum ddt_class dde_class; + uint8_t dde_loading; + uint8_t dde_loaded; + kcondvar_t dde_cv; + avl_node_t dde_node; +}; + +/* + * In-core ddt + */ +struct ddt { + kmutex_t ddt_lock; + avl_tree_t ddt_tree; + avl_tree_t ddt_repair_tree; + enum zio_checksum ddt_checksum; + spa_t *ddt_spa; + objset_t *ddt_os; + uint64_t ddt_stat_object; + uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; + avl_node_t ddt_node; +}; + +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. 
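
[Editor's note - worked example, not part of the patch: assuming the usual BF64_*_SB encoding (the field stores (size >> SPA_MINBLOCKSHIFT) - 1), a 128 KB logical block is kept in bits 0-15 of ddk_prop as 255 and decoded by DDK_GET_LSIZE() as (255 + 1) << 9 = 131072; PSIZE occupies bits 16-31 the same way, and the compression function byte sits at bits 32-39.]
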
+ */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + void (*ddt_op_prefetch)(objset_t *os, uint64_t object, + ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + uint64_t (*ddt_op_count)(objset_t *os, uint64_t object); +} ddt_ops_t; + +#define DDT_NAMELEN 80 + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); +extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, dmu_object_info_t *); +extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); + +extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, + uint64_t txg); +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_phys_t *ddp, blkptr_t *bp); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); +extern void ddt_phys_clear(ddt_phys_t *ddp); +extern void ddt_phys_addref(ddt_phys_t *ddp); +extern void ddt_phys_decref(ddt_phys_t *ddp); +extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, + uint64_t txg); +extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); +extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); +extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); +extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); + +extern uint64_t ddt_get_dedup_dspace(spa_t *spa); +extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); + +extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, + ddt_phys_t *ddp_willref); +extern int ddt_ditto_copies_present(ddt_entry_t *dde); + +extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); +extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); + +extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern void ddt_enter(ddt_t *ddt); +extern void ddt_exit(ddt_t *ddt); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); +extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); + +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + +extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); +extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); + +extern int ddt_entry_compare(const void *x1, const void *x2); + +extern void ddt_create(spa_t 
*spa); +extern int ddt_load(spa_t *spa); +extern void ddt_unload(spa_t *spa); +extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx); + +extern const ddt_ops_t ddt_zap_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 08c30c8..4f91a91 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -19,15 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_DMU_H #define _SYS_DMU_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file describes the interface that the DMU provides for its * consumers. @@ -39,12 +38,14 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #endif struct uio; +struct xuio; struct page; struct vnode; struct spa; @@ -60,8 +61,9 @@ struct drr_end; struct zbookmark; struct spa; struct nvlist; -struct objset_impl; struct arc_buf; +struct zio_prop; +struct sa_handle; struct file; typedef struct objset objset_t; @@ -75,8 +77,8 @@ typedef enum dmu_object_type { DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ - DMU_OT_BPLIST, /* UINT64 */ - DMU_OT_BPLIST_HDR, /* UINT64 */ + DMU_OT_BPOBJ, /* UINT64 */ + DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ @@ -116,9 +118,22 @@ typedef enum dmu_object_type { DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ - DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_SA, /* System attr */ + DMU_OT_SA_MASTER_NODE, /* ZAP */ + DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ + DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ + DMU_OT_SCAN_XLATE, /* ZAP */ + DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ + DMU_OT_DEADLIST, /* ZAP */ + DMU_OT_DEADLIST_HDR, /* UINT64 */ + DMU_OT_DSL_CLONES, /* ZAP */ + DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -141,16 +156,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); -#define DS_MODE_NOHOLD 0 /* internal use only */ -#define DS_MODE_USER 1 /* simple access, no special needs */ -#define DS_MODE_OWNER 2 /* the "main" access, e.g. 
a mount */ -#define DS_MODE_TYPE_MASK 0x3 -#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) -#define DS_MODE_READONLY 0x8 -#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) -#define DS_MODE_INCONSISTENT 0x10 -#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) - #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) @@ -163,27 +168,35 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) +#define DMU_DEADLIST_OBJECT (-3ULL) /* + * artificial blkids for bonus buffer and spill blocks + */ +#define DMU_BONUS_BLKID (-1ULL) +#define DMU_SPILL_BLKID (-2ULL) +/* * Public routines to create, destroy, open, and close objsets. */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, - objset_t **osp); -void dmu_objset_close(objset_t *os); +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); + int dmu_objset_evict_dbufs(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_snapshots_destroy(char *fsname, char *snapname); -int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, - boolean_t recursive); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); +int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); @@ -202,7 +215,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_ROOT_DATASET "root_dataset" -#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" @@ -210,19 +223,12 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" - -/* 4x8 zbookmark_t */ -#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" -/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ -#define DMU_POOL_SCRUB_QUEUE "scrub_queue" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" -/* 1x4 enum scrub_func */ -#define DMU_POOL_SCRUB_FUNC "scrub_func" -/* 1x8 count */ -#define DMU_POOL_SCRUB_ERRORS "scrub_errors" +#define 
DMU_POOL_TMP_USERREFS "tmp_userrefs" +#define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_CREATION_VERSION "creation_version" +#define DMU_POOL_SCAN "scan" +#define DMU_POOL_FREE_BPOBJ "free_bpobj" /* * Allocate an object from this objset. The range of object numbers @@ -307,11 +313,14 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); /* - * Decide how many copies of a given block we should make. Can be from - * 1 to SPA_DVAS_PER_BP. + * Decide how to write a block: checksum, compression, number of copies, etc. */ -int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, - dmu_object_type_t ot); +#define WP_NOFILL 0x1 +#define WP_DMU_SYNC 0x2 +#define WP_SPILL 0x4 + +void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, + struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a @@ -325,6 +334,18 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); +int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); +dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); +int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); + +/* + * Special spill buffer support used by "SA" framework + */ + +int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, + void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the @@ -341,7 +362,7 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **); + void *tag, dmu_buf_t **, int flags); void dmu_buf_add_ref(dmu_buf_t *db, void* tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); @@ -438,12 +459,35 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); +void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); /* + * To register a commit callback, dmu_tx_callback_register() must be called. + * + * dcb_data is a pointer to caller private data that is passed on as a + * callback parameter. The caller is responsible for properly allocating and + * freeing it. + * + * When registering a callback, the transaction must be already created, but + * it cannot be committed or aborted. It can be assigned to a txg or not. + * + * The callback will be called after the transaction has been safely written + * to stable storage and will also be called if the dmu_tx is aborted. + * If there is any error which prevents the transaction from being committed to + * disk, the callback will be called with a value of error != 0. 
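
[Editor's note - illustrative sketch, not part of the patch: a commit callback is registered on an open transaction and fires once the txg reaches stable storage, or with error != 0 if it never does. my_commit_cb and my_arg are hypothetical names:]

    static void
    my_commit_cb(void *my_arg, int error)
    {
        /* error != 0 means the transaction was never committed to disk */
    }

    tx = dmu_tx_create(os);
    /* dmu_tx_hold_*() calls would normally precede the assign */
    if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
        dmu_tx_abort(tx);
        return;
    }
    dmu_tx_callback_register(tx, my_commit_cb, my_arg);
    /* ... make changes under tx ... */
    dmu_tx_commit(tx);
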
+ */ +typedef void dmu_tx_callback_func_t(void *dcb_data, int error); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); + +/* * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. */ @@ -465,15 +509,28 @@ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); +int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, + dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(); +void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; @@ -484,19 +541,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len); typedef struct dmu_object_info { - /* All sizes are in bytes. */ + /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; - uint64_t doi_bonus_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; + uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_pad[5]; - /* Values below are number of 512-byte blocks. 
*/ - uint64_t doi_physical_blks; /* data + metadata */ - uint64_t doi_max_block_offset; + uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ + uint64_t doi_max_offset; + uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); @@ -565,6 +622,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, */ uint64_t dmu_objset_fsid_guid(objset_t *os); +/* + * Get the [cm]time for an objset's snapshot dir + */ +timestruc_t dmu_objset_snap_cmtime(objset_t *os); + int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); @@ -574,6 +636,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_syncprop(objset_t *os); +extern uint64_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, @@ -581,9 +645,8 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); -typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype, - void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused, - dmu_tx_t *tx); +typedef int objset_used_cb_t(dmu_object_type_t bonustype, + void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); @@ -604,9 +667,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. 
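
Below, dmu_sync() and its callback are reshaped around a zgd_t cookie instead of a bare dbuf. A rough sketch only (the real users are the {zfs,zvol,ztest}_get_data() paths; their ZIL, range-locking and error details are omitted, and the flag argument to dmu_buf_hold() is left at 0 for simplicity):

    static void
    example_get_done(zgd_t *zgd, int error)
    {
            if (zgd->zgd_db != NULL)
                    dmu_buf_rele(zgd->zgd_db, zgd);
            /* drop zgd->zgd_rl (range lock) here if one was taken */
            kmem_free(zgd, sizeof (zgd_t));
    }

            zgd_t *zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
            zgd->zgd_zilog = zilog;         /* the dataset's ZIL */
            zgd->zgd_bp = bp;               /* block pointer to be filled in */
            error = dmu_buf_hold(os, object, offset, zgd, &zgd->zgd_db, 0);
            if (error == 0)
                    error = dmu_sync(zio, txg, example_get_done, zgd);
            else
                    example_get_done(zgd, error);   /* clean up on failure */
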
*/ -typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); -int dmu_sync(struct zio *zio, dmu_buf_t *db, - struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * {zfs,zvol,ztest}_get_done() args + */ +typedef struct zgd { + struct zilog *zgd_zilog; + struct blkptr *zgd_bp; + dmu_buf_t *zgd_db; + struct rl *zgd_rl; + void *zgd_private; +} zgd_t; + +typedef void dmu_sync_cb_t(zgd_t *arg, int error); +int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off @@ -641,15 +715,19 @@ typedef struct dmu_recv_cookie { struct dsl_dataset *drc_real_ds; struct drr_begin *drc_drrb; char *drc_tosnap; + char *drc_top_ds; boolean_t drc_newfs; boolean_t drc_force; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp); +int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *); +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep); int dmu_recv_end(dmu_recv_cookie_t *drc); -void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); + +int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct file *fp, + offset_t *off); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index 96ce688..2cb7f12 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,6 +29,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -210,11 +211,11 @@ extern "C" { * * ds_lock * protects: - * ds_user_ptr - * ds_user_evice_func + * ds_objset * ds_open_refcount * ds_snapname * ds_phys accounting + * ds_phys userrefs zapobj * ds_reserved * held from: * dsl_dataset_* @@ -232,6 +233,39 @@ extern "C" { struct objset; struct dmu_pool; +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h index a8022d2..d687642 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H @@ -33,18 +34,23 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #endif +extern krwlock_t os_lock; + struct dsl_dataset; struct dmu_tx; -struct objset_impl; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 +#define OBJSET_BUF_HAS_USERUSED(buf) \ + (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) + #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { @@ -59,26 +65,32 @@ typedef struct objset_phys { } objset_phys_t; struct objset { - struct objset_impl *os; - int os_mode; -}; - -typedef struct objset_impl { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; - dnode_t *os_meta_dnode; - dnode_t *os_userused_dnode; - dnode_t *os_groupused_dnode; + /* + * The following "special" dnodes have no parent and are exempt from + * dnode_move(), but they root their descendents in this objset using + * handles anyway, so that all access to dnodes from dbufs consistently + * uses handles. 
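
The practical upshot of the comment above: the three special dnodes are now reached through embedded dnode_handle_t's rather than bare dnode_t pointers, and the DMU_META_DNODE()/DMU_USERUSED_DNODE()/DMU_GROUPUSED_DNODE() accessors defined a little further down hide the extra hop. An illustrative (not literal) before/after:

    /* previously: dnode_t *mdn = os->os_meta_dnode; */
    dnode_t *mdn = DMU_META_DNODE(os);  /* i.e. os->os_meta_dnode.dnh_dnode */

    /*
     * The special dnodes are exempt from dnode_move(), so holding the
     * objset keeps mdn stable; dnodes reached through dbufs must instead
     * be pinned via their handle's zrlock (see dnode_handle_t in dnode.h).
     */
    rw_enter(&mdn->dn_struct_rwlock, RW_READER);
    /* ... walk the meta-dnode ... */
    rw_exit(&mdn->dn_struct_rwlock);
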
+ */ + dnode_handle_t os_meta_dnode; + dnode_handle_t os_userused_dnode; + dnode_handle_t os_groupused_dnode; zilog_t *os_zil; - objset_t os; - uint8_t os_checksum; /* can change, under dsl_dir's locks */ - uint8_t os_compress; /* can change, under dsl_dir's locks */ - uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ - uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ + + /* can change, under dsl_dir's locks: */ + uint8_t os_checksum; + uint8_t os_compress; + uint8_t os_copies; + uint8_t os_dedup_checksum; + uint8_t os_dedup_verify; + uint8_t os_logbias; + uint8_t os_primary_cache; + uint8_t os_secondary_cache; + uint8_t os_sync; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ @@ -101,51 +113,69 @@ typedef struct objset_impl { /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; -} objset_impl_t; + /* SA layout/attribute registration */ + sa_os_t *os_sa; +}; + +#define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) +#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) +#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) +#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) /* called from zpl */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -void dmu_objset_close(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); + +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, - boolean_t recursive); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); int dmu_objset_find_spa(spa_t *spa, const char *name, int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); -int dmu_objset_prefetch(char *name, void *arg); +int dmu_objset_prefetch(const char *name, void *arg); void dmu_objset_byteswap(void *buf, size_t size); int dmu_objset_evict_dbufs(objset_t *os); +timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ -void 
dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); -objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, +void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); +boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +boolean_t dmu_objset_is_dirty_anywhere(objset_t *os); +objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_impl_t **osip); -void dmu_objset_evict(struct dsl_dataset *ds, void *arg); -void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx); -boolean_t dmu_objset_userused_enabled(objset_impl_t *os); + objset_t **osp); +void dmu_objset_evict(objset_t *os); +void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); +void dmu_objset_init(void); +void dmu_objset_fini(void); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index 3e02689..5b326cd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -36,19 +35,27 @@ extern "C" { struct dnode_phys; struct dsl_dataset; +struct zilog; +struct arc_buf; -typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, - const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); +typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp, + void *arg); #define TRAVERSE_PRE (1<<0) #define TRAVERSE_POST (1<<1) #define TRAVERSE_PREFETCH_METADATA (1<<2) #define TRAVERSE_PREFETCH_DATA (1<<3) #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) +#define TRAVERSE_HARD (1<<4) -int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); +/* Special traverse error return value to indicate skipping of children */ +#define TRAVERSE_VISIT_NO_CHILDREN -1 + +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h index 6aaf35d..bbc6634 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
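
For the dmu_traverse changes a few hunks up: the callback now also receives the ZIL and a parent arc_buf, and both traverse_dataset() and traverse_pool() take a starting txg plus flags. A sketch of a caller-supplied visitor (the counting callback and its names are illustrative only); returning nonzero aborts the walk, TRAVERSE_HARD asks the traverser to note read errors and keep going, and a TRAVERSE_PRE callback may return TRAVERSE_VISIT_NO_CHILDREN to prune a subtree:

    static int
    example_count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
        void *arg)
    {
            uint64_t *countp = arg;

            if (bp != NULL && bp->blk_birth != 0)   /* skip holes */
                    (*countp)++;
            return (0);
    }

            uint64_t count = 0;
            error = traverse_dataset(ds, 0 /* txg_start */,
                TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
                example_count_cb, &count);
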
*/ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -58,6 +56,7 @@ struct dmu_tx { txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; + list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ uint8_t tx_anyobj; int tx_err; #ifdef ZFS_DEBUG @@ -77,6 +76,7 @@ enum dmu_tx_hold_type { THT_FREE, THT_ZAP, THT_SPACE, + THT_SPILL, THT_NUMTYPES }; @@ -97,6 +97,11 @@ typedef struct dmu_tx_hold { #endif } dmu_tx_hold_t; +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; /* * These routines are defined in dmu.h, and are called by the user. @@ -108,6 +113,10 @@ void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); +void dmu_tx_do_callbacks(list_t *cb_list, int error); + /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 48e4da8..9ad4be3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DNODE_H @@ -33,6 +32,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -63,6 +63,18 @@ extern "C" { #define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ /* + * dnode id flags + * + * Note: a file will never ever have its + * ids moved from bonus->spill + * and only in a crypto environment would it be on spill + */ +#define DN_ID_CHKED_BONUS 0x1 +#define DN_ID_CHKED_SPILL 0x2 +#define DN_ID_OLD_EXIST 0x4 +#define DN_ID_NEW_EXIST 0x8 + +/* * Derived constants. */ #define DNODE_SIZE (1 << DNODE_SHIFT) @@ -70,10 +82,12 @@ extern "C" { #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) +#define DN_KILL_SPILLBLK (1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ @@ -88,7 +102,7 @@ extern "C" { #define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) struct dmu_buf_impl; -struct objset_impl; +struct objset; struct zio; enum dnode_dirtycontext { @@ -101,6 +115,9 @@ enum dnode_dirtycontext { #define DNODE_FLAG_USED_BYTES (1<<0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) +/* Does dnode have a SA spill blkptr in bonus? 
*/ +#define DNODE_FLAG_SPILL_BLKPTR (1<<2) + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ @@ -121,7 +138,8 @@ typedef struct dnode_phys { uint64_t dn_pad3[4]; blkptr_t dn_blkptr[1]; - uint8_t dn_bonus[DN_MAX_BONUSLEN]; + uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; + blkptr_t dn_spill; } dnode_phys_t; typedef struct dnode { @@ -136,9 +154,10 @@ typedef struct dnode { list_node_t dn_link; /* immutable: */ - struct objset_impl *dn_objset; + struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; + struct dnode_handle *dn_handle; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ /* @@ -155,15 +174,21 @@ typedef struct dnode { uint8_t dn_nlevels; uint8_t dn_indblkshift; uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint8_t dn_moved; /* Has this dnode been moved? */ uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; + uint8_t dn_next_bonustype[TXG_SIZE]; + uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ + uint32_t dn_dbufs_count; /* count of dn_dbufs */ + /* protected by os_lock: */ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ @@ -183,33 +208,60 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + list_t dn_dbufs; /* descendent dbufs */ + + /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + boolean_t dn_have_spill; /* have spill or are spilling */ + /* parent IO for current sync write */ zio_t *dn_zio; /* used in syncing context */ - dnode_phys_t *dn_oldphys; + uint64_t dn_oldused; /* old phys used bytes */ + uint64_t dn_oldflags; /* old phys dn_flags */ + uint64_t dn_olduid, dn_oldgid; + uint64_t dn_newuid, dn_newgid; + int dn_id_flags; /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; +/* + * Adds a level of indirection between the dbuf and the dnode to avoid + * iterating descendent dbufs in dnode_move(). Handles are not allocated + * individually, but as an array of child dnodes in dnode_hold_impl(). + */ +typedef struct dnode_handle { + /* Protects dnh_dnode from modification by dnode_move(). 
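
A sketch of the access pattern the handle is meant to support, assuming the zrl_add()/zrl_remove() primitives from the newly added sys/zrlock.h and a handle supplied by whatever structure embeds it (the dbuf layer is the main consumer); none of this is literal code from the patch:

    dnode_handle_t *dnh;        /* e.g. obtained from the owning dbuf */
    dnode_t *dn;

    zrl_add(&dnh->dnh_zrlock);  /* pin: dnode_move() skips held handles */
    dn = dnh->dnh_dnode;
    /* ... dereference dn freely; it cannot be relocated while pinned ... */
    zrl_remove(&dnh->dnh_zrlock);
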
*/ + zrlock_t dnh_zrlock; + dnode_t *dnh_dnode; +} dnode_handle_t; + +typedef struct dnode_children { + size_t dnc_count; /* number of children */ + dnode_handle_t dnc_children[1]; /* sized dynamically */ +} dnode_children_t; + typedef struct free_range { avl_node_t fr_node; uint64_t fr_blkid; uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, - uint64_t object); -void dnode_special_close(dnode_t *dn); +dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, + uint64_t object, dnode_handle_t *dnh); +void dnode_special_close(dnode_handle_t *dnh); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); -int dnode_hold(struct objset_impl *dd, uint64_t object, +void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); +void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); + +int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index a1c2896..22733d0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DATASET_H @@ -33,6 +32,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -42,8 +42,6 @@ struct dsl_dataset; struct dsl_dir; struct dsl_pool; -typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); - #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) @@ -63,6 +61,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); #define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) /* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) +#define DS_IS_DEFER_DESTROY(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) + +/* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. 
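
Stepping back to the dnode_hold()/dnode_rele() declarations just above (which now take an objset_t rather than the removed objset_impl_t), the hold/release pairing itself is unchanged in shape; a minimal sketch using the FTAG convention from refcount.h:

    dnode_t *dn;
    int error;

    error = dnode_hold(os, object, FTAG, &dn);
    if (error != 0)
            return (error);
    /* ... inspect dn->dn_phys, take dn->dn_struct_rwlock as needed ... */
    dnode_rele(dn, FTAG);
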
*/ @@ -77,7 +83,7 @@ typedef struct dsl_dataset_phys { uint64_t ds_num_children; /* clone/snap children; ==0 for head */ uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ + uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ uint64_t ds_used_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; @@ -93,7 +99,8 @@ typedef struct dsl_dataset_phys { blkptr_t ds_bp; uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ + uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -106,10 +113,13 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; - uint64_t ds_origin_txg; /* has internal locking: */ - bplist_t ds_deadlist; + dsl_deadlist_t ds_deadlist; + bplist_t ds_pending_deadlist; + + /* to protect against multiple concurrent incremental recv */ + kmutex_t ds_recvlock; /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; @@ -120,8 +130,8 @@ typedef struct dsl_dataset { * Protected by ds_lock: */ kmutex_t ds_lock; - void *ds_user_ptr; - dsl_dataset_evict_func_t *ds_user_evict_func; + objset_t *ds_objset; + uint64_t ds_userrefs; /* * ds_owner is protected by the ds_rwlock and the ds_lock @@ -143,7 +153,32 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; -#define dsl_dataset_is_snapshot(ds) \ +struct dsl_ds_destroyarg { + dsl_dataset_t *ds; /* ds to destroy */ + dsl_dataset_t *rm_origin; /* also remove our origin? */ + boolean_t is_origin_rm; /* set if removing origin snap */ + boolean_t defer; /* destroy -d requested? */ + boolean_t releasing; /* destroying due to release? */ + boolean_t need_prep; /* do we need to retry due to EBUSY? */ +}; + +/* + * The max length of a temporary tag prefix is the number of hex digits + * required to express UINT64_MAX plus one for the hyphen. 
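
Concretely: UINT64_MAX is 0xFFFFFFFFFFFFFFFF, i.e. 16 hex digits, and the trailing hyphen brings that to the 17 of MAX_TAG_PREFIX_LEN defined just below. An illustrative (not literal) formatting of such a prefix, with some_id standing in for whatever identifier the caller uses:

    char prefix[MAX_TAG_PREFIX_LEN + 1];        /* + 1 for the terminating NUL */

    /* "ffffffffffffffff-" is the longest possible prefix: 16 + 1 = 17 */
    (void) snprintf(prefix, sizeof (prefix), "%llx-", (u_longlong_t)some_id);
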
+ */ +#define MAX_TAG_PREFIX_LEN 17 + +struct dsl_ds_holdarg { + dsl_sync_task_group_t *dstg; + char *htag; + char *snapname; + boolean_t recursive; + boolean_t gotone; + boolean_t temphold; + char failed[MAXPATHLEN]; +}; + +#define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) #define DS_UNIQUE_IS_ACCURATE(ds) \ @@ -152,36 +187,43 @@ typedef struct dsl_dataset { int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **); -int dsl_dataset_own(const char *name, int flags, void *owner, - dsl_dataset_t **dsp); +int dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - int flags, void *owner, dsl_dataset_t **); + boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); void dsl_dataset_name(dsl_dataset_t *ds, char *name); void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, - void *owner); -void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); + void *tag); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); +void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); -int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); -int dsl_snapshots_destroy(char *fsname, char *snapname); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); +int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); +dsl_syncfunc_t dsl_dataset_user_hold_sync; int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -int dsl_dataset_promote(const char *name); +int dsl_dataset_promote(const char *name, char *conflsnap); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); - -void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func); -void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive, boolean_t temphold, int cleanup_fd); +int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, + boolean_t temphold); +int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, + char *htag, boolean_t retry); +int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); @@ -192,10 +234,12 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); -void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, 
dmu_tx_t *tx); -int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); -boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx, boolean_t async); +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, + uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); @@ -211,13 +255,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv); -int dsl_dataset_set_quota(const char *dsname, uint64_t quota); -void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx); -int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); -void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); -int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, - dmu_tx_t *tx); +int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, + uint64_t quota); +dsl_syncfunc_t dsl_dataset_set_quota_sync; +int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation); + +int dsl_destroy_inconsistent(const char *dsname, void *arg); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h new file mode 100644 index 0000000..d2c16d7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_DSL_DEADLIST_H +#define _SYS_DSL_DEADLIST_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf; +struct dsl_dataset; + +typedef struct dsl_deadlist_phys { + uint64_t dl_used; + uint64_t dl_comp; + uint64_t dl_uncomp; + uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ +} dsl_deadlist_phys_t; + +typedef struct dsl_deadlist { + objset_t *dl_os; + uint64_t dl_object; + avl_tree_t dl_tree; + boolean_t dl_havetree; + struct dmu_buf *dl_dbuf; + dsl_deadlist_phys_t *dl_phys; + kmutex_t dl_lock; + + /* if it's the old on-disk format: */ + bpobj_t dl_bpobj; + boolean_t dl_oldfmt; +} dsl_deadlist_t; + +typedef struct dsl_deadlist_entry { + avl_node_t dle_node; + uint64_t dle_mintxg; + bpobj_t dle_bpobj; +} dsl_deadlist_entry_t; + +void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); +void dsl_deadlist_close(dsl_deadlist_t *dl); +uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); +void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx); +void dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_space_range(dsl_deadlist_t *dl, + uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); +void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DEADLIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h index b064c92..73c43bd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
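
Returning to the dsl_deadlist interface above: a dataset normally opens its deadlist once (ds->ds_deadlist) and queries it in place, but as a free-standing sketch the open/query/close cycle looks like the following; used/comp/uncomp mirror the dl_used/dl_comp/dl_uncomp accounting, and an old-format (plain bpobj) deadlist is handled transparently through dl_oldfmt:

    dsl_deadlist_t dl;
    uint64_t used, comp, uncomp;

    dsl_deadlist_open(&dl, dp->dp_meta_objset,
        ds->ds_phys->ds_deadlist_obj);
    dsl_deadlist_space(&dl, &used, &comp, &uncomp);
    dsl_deadlist_close(&dl);
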
*/ #ifndef _SYS_DSL_DELEG_H @@ -53,6 +52,9 @@ extern "C" { #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" /* * Note: the names of properties that are marked delegatable are also @@ -62,6 +64,7 @@ extern "C" { int dsl_deleg_get(const char *ddname, nvlist_t **nvp); int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h index 56d0638..2191635 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DIR_H @@ -70,7 +69,8 @@ typedef struct dsl_dir_phys { uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ uint64_t dd_flags; uint64_t dd_used_breakdown[DD_USED_NUM]; - uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ + uint64_t dd_clones; /* dsl_dir objects */ + uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { @@ -89,6 +89,8 @@ struct dsl_dir { /* Protected by dd_lock */ kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ + uint64_t dd_origin_txg; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; @@ -125,18 +127,24 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_set_quota(const char *ddname, zprop_source_t source, + uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" +#define XLATION_DIR_NAME "$XLATION" +#define FREE_DIR_NAME "$FREE" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) 
do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index d8da295..7d25bd7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_POOL_H @@ -32,6 +31,9 @@ #include #include #include +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -42,12 +44,7 @@ struct dsl_dir; struct dsl_dataset; struct dsl_pool; struct dmu_tx; - -enum scrub_func { - SCRUB_FUNC_NONE, - SCRUB_FUNC_CLEAN, - SCRUB_FUNC_NUMFUNCS -}; +struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE @@ -75,6 +72,7 @@ typedef struct dsl_pool { struct objset *dp_meta_objset; struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; + struct dsl_dir *dp_free_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_vnrele_taskq; @@ -83,25 +81,18 @@ typedef struct dsl_pool { blkptr_t dp_meta_rootbp; list_t dp_synced_datasets; hrtime_t dp_read_overhead; - uint64_t dp_throughput; + uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; + uint64_t dp_tmp_userrefs_obj; + bpobj_t dp_free_bpobj; + + struct dsl_scan *dp_scan; /* Uses dp_lock */ kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; - enum scrub_func dp_scrub_func; - uint64_t dp_scrub_queue_obj; - uint64_t dp_scrub_min_txg; - uint64_t dp_scrub_max_txg; - zbookmark_t dp_scrub_bookmark; - boolean_t dp_scrub_pausing; - boolean_t dp_scrub_isresilver; - uint64_t dp_scrub_start_time; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ - boolean_t dp_scrub_restart; - /* Has its own locking */ tx_state_t dp_tx; txg_list_t dp_dirty_datasets; @@ -123,29 +114,36 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_zil_clean(dsl_pool_t *dp); +void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_memory_pressure(dsl_pool_t *dp); void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, - struct dmu_tx *tx); +void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); +void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, + const blkptr_t *bpp); +int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + 
uint32_t *arc_flags, const zbookmark_t *zb); +int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); - -int dsl_pool_scrub_cancel(dsl_pool_t *dp); -int dsl_pool_scrub_clean(dsl_pool_t *dp); -void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_scrub_restart(dsl_pool_t *dp); +void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); +extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx); +extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, dmu_tx_t *tx); +extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h index 26018a4..a636ad3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_PROP_H @@ -49,6 +48,25 @@ typedef struct dsl_prop_cb_record { void *cbr_arg; } dsl_prop_cb_record_t; +typedef struct dsl_props_arg { + nvlist_t *pa_props; + zprop_source_t pa_source; +} dsl_props_arg_t; + +typedef struct dsl_prop_set_arg { + const char *psa_name; + zprop_source_t psa_source; + int psa_intsz; + int psa_numints; + const void *psa_value; + + /* + * Used to handle the special requirements of the quota and reservation + * properties. 
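
The psa_effective_value field declared immediately below is what makes the "predict" step possible: a sync task can ask what a numeric property would become before committing it. A sketch loosely modeled on how the quota/reservation sync tasks use it (the property name, value and ZPROP_SRC_LOCAL source are illustrative):

    dsl_prop_setarg_t psa;
    uint64_t newquota = 10ULL << 30;    /* hypothetical 10 GiB */

    dsl_prop_setarg_init_uint64(&psa, "quota", ZPROP_SRC_LOCAL, &newquota);

    /* in syncing context, compute the would-be effective value */
    (void) dsl_prop_predict_sync(ds->ds_dir, &psa);
    /* psa.psa_effective_value now holds the predicted result */
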
+ */ + uint64_t psa_effective_value; +} dsl_prop_setarg_t; + int dsl_prop_register(struct dsl_dataset *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg); int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, @@ -59,18 +77,36 @@ int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint); + int intsz, int numints, void *buf, char *setpoint, + boolean_t snapshot); dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, - int intsz, int numints, const void *buf); -int dsl_props_set(const char *dsname, nvlist_t *nvl); -void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx); + zprop_source_t source, int intsz, int numints, const void *buf); +int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); +void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + dmu_tx_t *tx); + +void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value); +int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#ifdef ZFS_DEBUG +void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ + dsl_prop_check_prediction((dd), (psa)) +#else +#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ +#endif + +/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ +boolean_t dsl_prop_get_hasrecvd(objset_t *os); +void dsl_prop_set_hasrecvd(objset_t *os); +void dsl_prop_unset_hasrecvd(objset_t *os); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h new file mode 100644 index 0000000..c79666e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_DSL_SCAN_H +#define _SYS_DSL_SCAN_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +/* + * All members of this structure must be uint64_t, for byteswap + * purposes. + */ +typedef struct dsl_scan_phys { + uint64_t scn_func; /* pool_scan_func_t */ + uint64_t scn_state; /* dsl_scan_state_t */ + uint64_t scn_queue_obj; + uint64_t scn_min_txg; + uint64_t scn_max_txg; + uint64_t scn_cur_min_txg; + uint64_t scn_cur_max_txg; + uint64_t scn_start_time; + uint64_t scn_end_time; + uint64_t scn_to_examine; /* total bytes to be scanned */ + uint64_t scn_examined; /* bytes scanned so far */ + uint64_t scn_to_process; + uint64_t scn_processed; + uint64_t scn_errors; /* scan I/O error count */ + uint64_t scn_ddt_class_max; + ddt_bookmark_t scn_ddt_bookmark; + zbookmark_t scn_bookmark; + uint64_t scn_flags; /* dsl_scan_flags_t */ +} dsl_scan_phys_t; + +#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) + +typedef enum dsl_scan_flags { + DSF_VISIT_DS_AGAIN = 1<<0, +} dsl_scan_flags_t; + +typedef struct dsl_scan { + struct dsl_pool *scn_dp; + + boolean_t scn_pausing; + uint64_t scn_restart_txg; + uint64_t scn_sync_start_time; + zio_t *scn_zio_root; + + /* for debugging / information */ + uint64_t scn_visited_this_txg; + + dsl_scan_phys_t scn_phys; +} dsl_scan_t; + +int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_fini(struct dsl_pool *dp); +void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); +int dsl_scan_cancel(struct dsl_pool *); +int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); +void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx); +void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); +boolean_t dsl_scan_active(dsl_scan_t *scn); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SCAN_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h index 4995bfe..9126290 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
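
The dsl_scan entry points above replace the old dsl_pool_scrub_*() calls. As an assumption about how they are reached (the zpool scrub ioctl should arrive here via spa_scan(); treat the exact call chain and the POOL_SCAN_SCRUB constant as unverified in this excerpt), a sketch of starting and cancelling a scrub:

    /* start a scrub of the whole pool */
    error = dsl_scan(spa_get_dsl(spa), POOL_SCAN_SCRUB);

    /* ... later, stop it */
    error = dsl_scan_cancel(spa_get_dsl(spa));
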
*/ #ifndef _SYS_DSL_SYNCTASK_H #define _SYS_DSL_SYNCTASK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -38,7 +35,7 @@ extern "C" { struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); typedef struct dsl_sync_task { list_node_t dst_node; @@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group { txg_node_t dstg_node; list_t dstg_tasks; struct dsl_pool *dstg_pool; - cred_t *dstg_cr; uint64_t dstg_txg; int dstg_err; int dstg_space; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index c77b772..583d630 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -36,9 +35,6 @@ extern "C" { #endif -typedef struct metaslab_class metaslab_class_t; -typedef struct metaslab_group metaslab_group_t; - extern space_map_ops_t *zfs_metaslab_ops; extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, @@ -58,14 +54,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops); +extern metaslab_class_t *metaslab_class_create(spa_t *spa, + space_map_ops_t *ops); extern void metaslab_class_destroy(metaslab_class_t *mc); -extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); -extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); +extern int metaslab_class_validate(metaslab_class_t *mc); + +extern void metaslab_class_space_update(metaslab_class_t *mc, + int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta, int64_t dspace_delta); +extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_space(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); extern void metaslab_group_destroy(metaslab_group_t *mg); +extern void metaslab_group_activate(metaslab_group_t *mg); +extern void metaslab_group_passivate(metaslab_group_t *mg); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 5f0b770..07988dd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -37,9 +37,14 @@ extern "C" { #endif struct metaslab_class { + spa_t *mc_spa; metaslab_group_t *mc_rotor; - uint64_t mc_allocated; space_map_ops_t *mc_ops; + uint64_t mc_aliquot; + uint64_t mc_alloc; /* total allocated space */ + uint64_t mc_deferred; /* total deferred frees */ + uint64_t mc_space; /* total space (alloc + free) */ + uint64_t mc_dspace; /* total deflated space */ }; struct metaslab_group { @@ -48,6 +53,7 @@ struct metaslab_group { uint64_t mg_aliquot; uint64_t mg_bonus_area; int64_t 
mg_bias; + int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; @@ -67,7 +73,9 @@ struct metaslab { space_map_obj_t ms_smo_syncing; /* syncing space map object */ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ space_map_t ms_map; /* in-core free space map */ + int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h index e84b1bf..37a28b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_REFCOUNT_H #define _SYS_REFCOUNT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include_next @@ -45,7 +42,7 @@ extern "C" { */ #define FTAG ((char *)__func__) -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG typedef struct reference { list_node_t ref_link; void *ref_holder; @@ -72,11 +69,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag); int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); +void refcount_transfer(refcount_t *dst, refcount_t *src); void refcount_sysinit(void); void refcount_fini(void); -#else /* DEBUG */ +#else /* ZFS_DEBUG */ typedef struct refcount { uint64_t rc_count; @@ -93,11 +91,16 @@ typedef struct refcount { atomic_add_64_nv(&(rc)->rc_count, number) #define refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) +#define refcount_transfer(dst, src) { \ + uint64_t __tmp = (src)->rc_count; \ + atomic_add_64(&(src)->rc_count, -__tmp); \ + atomic_add_64(&(dst)->rc_count, __tmp); \ +} #define refcount_sysinit() #define refcount_fini() -#endif /* DEBUG */ +#endif /* ZFS_DEBUG */ #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h new file mode 100644 index 0000000..e125201 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
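
On the refcount_transfer() addition above: in the non-debug build it simply moves the whole count from src to dst, while the ZFS_DEBUG variant also migrates the tracked holder tags; either way the caller must ensure nothing is concurrently adding or removing references on src. A one-line usage sketch (old_rc/new_rc are hypothetical):

    /* move every hold tracked in old_rc onto new_rc */
    refcount_transfer(&new_rc, &old_rc);
    ASSERT(refcount_count(&old_rc) == 0);
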
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_SA_H +#define _SYS_SA_H + +#include +#include + +/* + * Currently available byteswap functions. + * If it all possible new attributes should used + * one of the already defined byteswap functions. + * If a new byteswap function is added then the + * ZPL/Pool version will need to be bumped. + */ + +typedef enum sa_bswap_type { + SA_UINT64_ARRAY, + SA_UINT32_ARRAY, + SA_UINT16_ARRAY, + SA_UINT8_ARRAY, + SA_ACL, +} sa_bswap_type_t; + +typedef uint16_t sa_attr_type_t; + +/* + * Attribute to register support for. + */ +typedef struct sa_attr_reg { + char *sa_name; /* attribute name */ + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; /* bswap functon enum */ + sa_attr_type_t sa_attr; /* filled in during registration */ +} sa_attr_reg_t; + + +typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, + boolean_t, void *userptr); + +/* + * array of attributes to store. + * + * This array should be treated as opaque/private data. + * The SA_BULK_ADD_ATTR() macro should be used for manipulating + * the array. + * + * When sa_replace_all_by_template() is used the attributes + * will be stored in the order defined in the array, except that + * the attributes may be split between the bonus and the spill buffer + * + */ +typedef struct sa_bulk_attr { + void *sa_data; + sa_data_locator_t *sa_data_func; + uint16_t sa_length; + sa_attr_type_t sa_attr; + /* the following are private to the sa framework */ + void *sa_addr; + uint16_t sa_buftype; + uint16_t sa_size; +} sa_bulk_attr_t; + + +/* + * special macro for adding entries for bulk attr support + * bulk - sa_bulk_attr_t + * count - integer that will be incremented during each add + * attr - attribute to manipulate + * func - function for accessing data. + * data - pointer to data. 
+ * len - length of data + */ + +#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ +{ \ + b[idx].sa_attr = attr;\ + b[idx].sa_data_func = func; \ + b[idx].sa_data = data; \ + b[idx++].sa_length = len; \ +} + +typedef struct sa_os sa_os_t; + +typedef enum sa_handle_type { + SA_HDL_SHARED, + SA_HDL_PRIVATE +} sa_handle_type_t; + +struct sa_handle; +typedef void *sa_lookup_tab_t; +typedef struct sa_handle sa_handle_t; + +typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); + +int sa_handle_get(objset_t *, uint64_t, void *userp, + sa_handle_type_t, sa_handle_t **); +int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, + sa_handle_type_t, sa_handle_t **); +void sa_handle_destroy(sa_handle_t *); +int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); +void sa_buf_rele(dmu_buf_t *, void *); +int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); +int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, + uint32_t buflen, dmu_tx_t *); +int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); +int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); +int sa_size(sa_handle_t *, sa_attr_type_t, int *); +int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, + uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); +void sa_object_info(sa_handle_t *, dmu_object_info_t *); +void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); +void sa_update_user(sa_handle_t *, sa_handle_t *); +void *sa_get_userdata(sa_handle_t *); +void sa_set_userp(sa_handle_t *, void *); +dmu_buf_t *sa_get_db(sa_handle_t *); +uint64_t sa_handle_object(sa_handle_t *); +boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); +void sa_register_update_callback(objset_t *, sa_update_cb_t *); +int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); +void sa_tear_down(objset_t *); +int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +boolean_t sa_enabled(objset_t *); +void sa_cache_init(); +void sa_cache_fini(); +int sa_set_sa_object(objset_t *, uint64_t); +int sa_hdrsize(void *); +void sa_handle_lock(sa_handle_t *); +void sa_handle_unlock(sa_handle_t *); + +#ifdef _KERNEL +int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h new file mode 100644 index 0000000..6661e47 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
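A sketch of how the bulk interface above is meant to be driven: fill a sa_bulk_attr_t array with SA_ADD_BULK_ATTR() and hand it to sa_bulk_lookup() in one call. The attribute ids and sizes below are placeholders; real callers get their ids back from sa_setup():

#include <sys/sa.h>

static sa_attr_type_t SA_HYP_SIZE;	/* hypothetical ids filled in by sa_setup() */
static sa_attr_type_t SA_HYP_MTIME;

static int
read_size_and_mtime(sa_handle_t *hdl, uint64_t *sizep, uint64_t mtime[2])
{
	sa_bulk_attr_t bulk[2];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_HYP_SIZE, NULL, sizep,
	    sizeof (uint64_t));
	SA_ADD_BULK_ATTR(bulk, count, SA_HYP_MTIME, NULL, mtime,
	    2 * sizeof (uint64_t));

	return (sa_bulk_lookup(hdl, bulk, count));	/* one pass over the handle */
}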
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_SA_IMPL_H +#define _SYS_SA_IMPL_H + +#include +#include +#include + +/* + * Array of known attributes and their + * various characteristics. + */ +typedef struct sa_attr_table { + sa_attr_type_t sa_attr; + uint8_t sa_registered; + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; + char *sa_name; +} sa_attr_table_t; + +/* + * Zap attribute format for attribute registration + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | unused | len | bswap | attr num | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Zap attribute format for layout information. + * + * layout information is stored as an array of attribute numbers + * The name of the attribute is the layout number (0, 1, 2, ...) + * + * 16 0 + * +---- ---+ + * | attr # | + * +--------+ + * | attr # | + * +--- ----+ + * ...... + * + */ + +#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) +#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) +#define ATTR_NUM(x) BF32_GET(x, 0, 16) +#define ATTR_ENCODE(x, attr, length, bswap) \ +{ \ + BF64_SET(x, 24, 16, length); \ + BF64_SET(x, 16, 8, bswap); \ + BF64_SET(x, 0, 16, attr); \ +} + +#define TOC_OFF(x) BF32_GET(x, 0, 23) +#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) +#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) +#define TOC_ATTR_ENCODE(x, len_idx, offset) \ +{ \ + BF32_SET(x, 31, 1, 1); \ + BF32_SET(x, 24, 7, len_idx); \ + BF32_SET(x, 0, 24, offset); \ +} + +#define SA_LAYOUTS "LAYOUTS" +#define SA_REGISTRY "REGISTRY" + +/* + * Each unique layout will have their own table + * sa_lot (layout_table) + */ +typedef struct sa_lot { + avl_node_t lot_num_node; + avl_node_t lot_hash_node; + uint64_t lot_num; + uint64_t lot_hash; + sa_attr_type_t *lot_attrs; /* array of attr #'s */ + uint32_t lot_var_sizes; /* how many aren't fixed size */ + uint32_t lot_attr_count; /* total attr count */ + list_t lot_idx_tab; /* should be only a couple of entries */ + int lot_instance; /* used with lot_hash to identify entry */ +} sa_lot_t; + +/* index table of offsets */ +typedef struct sa_idx_tab { + list_node_t sa_next; + sa_lot_t *sa_layout; + uint16_t *sa_variable_lengths; + refcount_t sa_refcount; + uint32_t *sa_idx_tab; /* array of offsets */ +} sa_idx_tab_t; + +/* + * Since the offset/index information into the actual data + * will usually be identical we can share that information with + * all handles that have the exact same offsets. + * + * You would typically only have a large number of different table of + * contents if you had a several variable sized attributes. + * + * Two AVL trees are used to track the attribute layout numbers. + * one is keyed by number and will be consulted when a DMU_OT_SA + * object is first read. The second tree is keyed by the hash signature + * of the attributes and will be consulted when an attribute is added + * to determine if we already have an instance of that layout. Both + * of these tree's are interconnected. The only difference is that + * when an entry is found in the "hash" tree the list of attributes will + * need to be compared against the list of attributes you have in hand. 
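The registry ZAP described above stores one 64-bit word per registered attribute, with the attribute number in the low 16 bits, the byteswap enum above that, and the fixed length above that. The following stand-alone sketch re-expresses ATTR_ENCODE()/ATTR_NUM()/ATTR_BSWAP()/ATTR_LENGTH() with plain shifts and made-up values, just to make the packing concrete:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t word;
	unsigned attr = 5, len = 8, bswap = 0;	/* e.g. an 8-byte SA_UINT64_ARRAY attribute */

	/* Equivalent of ATTR_ENCODE(word, attr, len, bswap). */
	word = ((uint64_t)len << 24) | ((uint64_t)bswap << 16) | attr;

	printf("attr=%u bswap=%u len=%u\n",
	    (unsigned)(word & 0xffff),		/* ATTR_NUM()    */
	    (unsigned)((word >> 16) & 0xff),	/* ATTR_BSWAP()  */
	    (unsigned)((word >> 24) & 0xffff));	/* ATTR_LENGTH() */
	return (0);
}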
+ * The assumption is that typically attributes will just be updated and + * adding a completely new attribute is a very rare operation. + */ +struct sa_os { + kmutex_t sa_lock; + boolean_t sa_need_attr_registration; + boolean_t sa_force_spill; + uint64_t sa_master_obj; + uint64_t sa_reg_attr_obj; + uint64_t sa_layout_attr_obj; + int sa_num_attrs; + sa_attr_table_t *sa_attr_table; /* private attr table */ + sa_update_cb_t *sa_update_cb; + avl_tree_t sa_layout_num_tree; /* keyed by layout number */ + avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ + int sa_user_table_sz; + sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ +}; + +/* + * header for all bonus and spill buffers. + * The header has a fixed portion with a variable number + * of "lengths" depending on the number of variable sized + * attribues which are determined by the "layout number" + */ + +#define SA_MAGIC 0x2F505A /* ZFS SA */ +typedef struct sa_hdr_phys { + uint32_t sa_magic; + uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ + uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ + /* ... Data follows the lengths. */ +} sa_hdr_phys_t; + +/* + * sa_hdr_phys -> sa_layout_info + * + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-10 are the layout number + * Bits 11-16 are the size of the header. + * The hdrsize is the number * 8 + * + * For example. + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + +#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ +{ \ + BF32_SET_SB(x, 10, 6, 3, 0, size); \ + BF32_SET(x, 0, 10, num); \ +} + +typedef enum sa_buf_type { + SA_BONUS = 1, + SA_SPILL = 2 +} sa_buf_type_t; + +typedef enum sa_data_op { + SA_LOOKUP, + SA_UPDATE, + SA_ADD, + SA_REPLACE, + SA_REMOVE +} sa_data_op_t; + +/* + * Opaque handle used for most sa functions + * + * This needs to be kept as small as possible. + */ + +struct sa_handle { + kmutex_t sa_lock; + dmu_buf_t *sa_bonus; + dmu_buf_t *sa_spill; + objset_t *sa_os; + void *sa_userp; + sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ + sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ +}; + +#define SA_GET_DB(hdl, type) \ + (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) + +#define SA_GET_HDR(hdl, type) \ + ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ + type))->db.db_data)) + +#define SA_IDX_TAB_GET(hdl, type) \ + (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) + +#define IS_SA_BONUSTYPE(a) \ + ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) + +#define SA_BONUSTYPE_FROM_DB(db) \ + (dmu_get_bonustype((dmu_buf_t *)db)) + +#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) + +#define SA_LAYOUT_NUM(x, type) \ + ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ + ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 
1 : SA_HDR_LAYOUT_NUM(x)))) + + +#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length + +#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ + hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ + SA_REGISTERED_LEN(sa, attr)) + +#define SA_SET_HDR(hdr, num, size) \ + { \ + hdr->sa_magic = SA_MAGIC; \ + SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ + } + +#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ + { \ + bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ + bulk.sa_buftype = type; \ + bulk.sa_addr = \ + (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ + (uintptr_t)hdr); \ +} + +#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ + (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ + (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ + sizeof (uint16_t), 8) : 0))) + +int sa_add_impl(sa_handle_t *, sa_attr_type_t, + uint32_t, sa_data_locator_t, void *, dmu_tx_t *); + +void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); +int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); + +void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, + uint16_t *, sa_hdr_phys_t *); + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index f54a5dc..23d48c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
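One more note before the spa.h changes: sa_layout_info, defined in sa_impl.h above, keeps the layout number in the low 10 bits and the header size divided by 8 in the 6 bits above it, which is what SA_HDR_LAYOUT_INFO_ENCODE() and SA_HDR_SIZE() implement. A plain-shift sketch with made-up values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	unsigned layout = 3, hdrsize = 16;	/* bytes; always a multiple of 8 */
	uint16_t info;

	/* Equivalent of SA_HDR_LAYOUT_INFO_ENCODE(info, layout, hdrsize). */
	info = (uint16_t)(((hdrsize >> 3) << 10) | (layout & 0x3ff));

	printf("info=0x%x layout=%u hdrsize=%u\n", (unsigned)info,
	    (unsigned)(info & 0x3ff),			/* SA_HDR_LAYOUT_NUM() */
	    (unsigned)(((info >> 10) & 0x3f) << 3));	/* SA_HDR_SIZE()       */
	return (0);
}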
*/ #ifndef _SYS_SPA_H @@ -43,8 +42,13 @@ extern "C" { typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; struct dsl_pool; /* @@ -134,15 +138,15 @@ typedef struct zio_cksum { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | padding | + * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | birth txg | + * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -166,25 +170,29 @@ typedef struct zio_cksum { * cksum checksum function * comp compression function * G gang block indicator - * E endianness - * type DMU object type + * B byteorder (endianness) + * D dedup + * X unused * lvl level of indirection - * birth txg transaction group in which the block was born + * type DMU object type + * phys birth txg of block allocation; zero if same as logical birth txg + * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ -typedef struct blkptr { - dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[3]; /* Extra space for the future */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + /* * Macros to get and set fields in a bp or DVA. 
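Dedup means a block's DVAs can be older than the txg in which it was most recently referenced, so one of the former pad words (word 9 in the diagram above) now records the physical allocation txg, while blk_birth keeps the logical one. The convention, encoded by the BP_SET_BIRTH()/BP_PHYSICAL_BIRTH() macros in the next hunk, is that blk_phys_birth stays 0 whenever the two are equal. A purely illustrative restatement as plain functions:

#include <stdint.h>

struct birth_words {			/* simplified stand-in for the two bp fields */
	uint64_t blk_phys_birth;	/* 0 means "same as blk_birth" */
	uint64_t blk_birth;		/* logical birth txg */
};

static void
set_birth(struct birth_words *bp, uint64_t logical, uint64_t physical)
{
	bp->blk_birth = logical;
	bp->blk_phys_birth = (logical == physical) ? 0 : physical;
}

static uint64_t
physical_birth(const struct birth_words *bp)
{
	return (bp->blk_phys_birth ? bp->blk_phys_birth : bp->blk_birth);
}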
*/ @@ -209,7 +217,6 @@ typedef struct blkptr { #define BP_GET_LSIZE(bp) \ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) - #define BP_SET_LSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) @@ -218,20 +225,35 @@ typedef struct blkptr { #define BP_SET_PSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) -#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) -#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) +#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) -#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ +} #define BP_GET_ASIZE(bp) \ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ @@ -239,7 +261,7 @@ typedef struct blkptr { #define BP_GET_UCSIZE(bp) \ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? 
\ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ @@ -255,6 +277,12 @@ typedef struct blkptr { ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + #define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ @@ -274,7 +302,10 @@ typedef struct blkptr { #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) -#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ @@ -287,14 +318,12 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_pad[2] = 0; \ + (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } -#define BLK_FILL_ALREADY_FREED (-1ULL) - /* * Note: the byteorder is either 0 or -1, both of which are palindromes. * This simplifies the endianness handling a bit. @@ -309,27 +338,88 @@ typedef struct blkptr { #define BP_SPRINTF_LEN 320 +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. + */ +#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ +{ \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int size = BP_SPRINTF_LEN; \ + int len = 0; \ + int copies = 0; \ + \ + if (bp == NULL) { \ + len = func(buf + len, size - len, ""); \ + } else if (BP_IS_HOLE(bp)) { \ + len = func(buf + len, size - len, ""); \ + } else { \ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? "dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)bp->blk_fill, \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ +} + #include #define BP_GET_BUFC_TYPE(bp) \ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? 
\ ARC_BUFC_METADATA : ARC_BUFC_DATA); -/* - * Routines found in spa.c - */ + +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); -extern int spa_check_rootconf(char *devpath, char *devid, - nvlist_t **bestconf, uint64_t *besttxg); -extern boolean_t spa_rootdev_validate(nvlist_t *nv); extern int spa_import_rootpool(char *devpath, char *devid); -extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, + uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, @@ -341,12 +431,23 @@ extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 + +/* + * Controls the behavior of spa_vdev_remove(). + */ +#define SPA_REMOVE_UNSPARE 0x01 +#define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -355,8 +456,11 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -370,15 +474,23 @@ extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); -extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); -/* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); +/* + * DEFERRED_FREE must be large enough that regular blocks are not + * deferred. XXX so can't we change it back to 1? 
+ */ +#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; @@ -396,7 +508,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); -extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* * Miscellaneous SPA routines in spa_misc.c @@ -404,7 +515,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, const char *altroot); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); @@ -413,6 +524,7 @@ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); +#define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ @@ -432,12 +544,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_config_enter(spa_t *spa); +extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa); +extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_offline_log(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); + /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); @@ -449,36 +579,49 @@ extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_alloc(spa_t *spa); -extern uint64_t spa_get_space(spa_t *spa); -extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t 
spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); +extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern uint64_t spa_generate_guid(spa_t *spa); +extern void sprintf_blkptr(char *buf, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); + extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); #define strtonum(str, nptr) zfs_strtonum((str), (nptr)) @@ -491,10 +634,11 @@ typedef enum history_log_type { } history_log_type_t; typedef struct history_arg { - const char *ha_history_str; + char *ha_history_str; history_log_type_t ha_log_type; history_internal_events_t ha_event; - char ha_zone[MAXPATHLEN]; + char *ha_zone; + uid_t ha_uid; } history_arg_t; extern char *spa_his_ievent_table[]; @@ -504,16 +648,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -void spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_internal(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); +extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ struct zbookmark; -struct zio; -extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t stateoroffset, uint64_t length); + zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); @@ -544,7 +689,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #define dprintf_bp(bp, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + sprintf_blkptr(__blkbuf, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h index b56073b..1d3622f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -36,7 +34,6 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); -extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index ecb065c..a2f15d2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -36,6 +35,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -78,13 +78,6 @@ typedef struct spa_config_dirent { char *scd_path; } spa_config_dirent_t; -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, @@ -93,6 +86,25 @@ enum zio_taskq_type { ZIO_TASKQ_TYPES }; +/* + * State machine for the zpool-pooname process. The states transitions + * are done as follows: + * + * From To Routine + * PROC_NONE -> PROC_CREATED spa_activate() + * PROC_CREATED -> PROC_ACTIVE spa_thread() + * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() + * PROC_DEACTIVATE -> PROC_GONE spa_thread() + * PROC_GONE -> PROC_NONE spa_deactivate() + */ +typedef enum spa_proc_state { + SPA_PROC_NONE, /* spa_proc = &p0, no process created */ + SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ + SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ + SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ + SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ +} spa_proc_state_t; + struct spa { /* * Fields protected by spa_namespace_lock. 
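As the dprintf_bp() hunk above shows, sprintf_blkptr() no longer takes a length argument; callers are simply expected to provide a BP_SPRINTF_LEN-sized buffer. A hedged sketch of a caller in that style (the dprintf() sink is just illustrative):

#include <sys/zfs_context.h>
#include <sys/spa.h>

static void
log_bp(const blkptr_t *bp)
{
	char *buf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);

	sprintf_blkptr(buf, bp);	/* buffer length is implied by the contract */
	dprintf("bp: %s\n", buf);
	kmem_free(buf, BP_SPRINTF_LEN);
}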
@@ -101,13 +113,15 @@ struct spa { avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ + nvlist_t *spa_config_splitting; /* config for splitting */ + nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ - boolean_t spa_load_verbatim; /* load the given config? */ + uint64_t spa_import_flags; /* import specific flags */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ @@ -115,6 +129,9 @@ struct spa { uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ + uint64_t spa_load_max_txg; /* best initial ub_txg */ + uint64_t spa_claim_max_txg; /* highest claimed birth txg */ + timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ @@ -124,21 +141,24 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ - uint64_t spa_sync_bplist_obj; /* object for deferred frees */ - bplist_t spa_sync_bplist; /* deferred-free bplist */ + bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ + bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_extreme_rewind; /* rewind past deferred frees */ + uint64_t spa_last_io; /* lbolt of last non-scan I/O */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ - uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ - uint64_t spa_scrub_errors; /* scrub I/O error count */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? 
*/ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + uint64_t spa_scan_pass_start; /* start time per pass/reboot */ + uint64_t spa_scan_pass_exam; /* examined bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ @@ -146,7 +166,14 @@ struct spa { uint16_t spa_async_tasks; /* async task mask */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ - boolean_t spa_last_open_failed; /* true if last open faled */ + int spa_last_open_failed; /* error if last open failed */ + uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ + uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_txg; /* ub txg that loaded */ + uint64_t spa_load_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_meta_errors; /* verify metadata err count */ + uint64_t spa_load_data_errors; /* verify data err count */ + uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ @@ -168,10 +195,27 @@ struct spa { kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ + uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ + uint64_t spa_autoexpand; /* lun expansion on/off */ + ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ + uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_ditto; /* dedup ditto threshold */ + uint64_t spa_dedup_checksum; /* default dedup checksum */ + uint64_t spa_dspace; /* dspace in normal class */ + kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ + kmutex_t spa_proc_lock; /* protects spa_proc* */ + kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ + spa_proc_state_t spa_proc_state; /* see definition */ + struct proc *spa_proc; /* "zpool-poolname" process */ + uint64_t spa_did; /* if procp != p0, did of t1 */ + boolean_t spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ + uint64_t spa_creation_version; /* version at pool creation */ + uint64_t spa_prev_software_version; /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. 
@@ -180,16 +224,13 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ refcount_t spa_refcount; /* number of opens */ +#ifndef sun + boolean_t spa_splitting_newspa; /* creating new spa in split */ +#endif }; extern const char *spa_config_path; -#define BOOTFS_COMPRESS_VALID(compress) \ - ((compress) == ZIO_COMPRESS_LZJB || \ - ((compress) == ZIO_COMPRESS_ON && \ - ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ - (compress) == ZIO_COMPRESS_OFF) - #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h index 23bdff2..e323d5e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TXG_H #define _SYS_TXG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -41,6 +39,9 @@ extern "C" { #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +/* Number of txgs worth of frees we defer adding to in-core spacemaps */ +#define TXG_DEFER_SIZE 2 + #define TXG_WAIT 1ULL #define TXG_NOWAIT 2ULL @@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_suspend(struct dsl_pool *dp); -extern void txg_resume(struct dsl_pool *dp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); /* * Delay the caller by the specified number of ticks or until @@ -117,6 +117,7 @@ extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); extern int txg_list_empty(txg_list_t *tl, uint64_t txg); extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h index 7413c66..7b356ea 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
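TXG_DEFER_SIZE above is what sizes the new ms_defermap[] arrays seen at the top of this change: roughly speaking, blocks freed in txg N are parked in the defer map for slot N % TXG_DEFER_SIZE and only become allocatable again TXG_DEFER_SIZE txgs later, which is part of what makes zpool import -F rewinds safe. A trivial sketch of the slot arithmetic, not taken from the patch:

#include <stdint.h>

#define	MY_TXG_DEFER_SIZE	2	/* mirrors TXG_DEFER_SIZE from txg.h */

static int
defer_slot(uint64_t txg)
{
	/* frees from 'txg' stay deferred until this slot is recycled */
	return ((int)(txg % MY_TXG_DEFER_SIZE));
}

static uint64_t
becomes_allocatable(uint64_t freed_txg)
{
	return (freed_txg + MY_TXG_DEFER_SIZE);	/* earliest txg the space is handed back */
}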
*/ @@ -37,13 +37,13 @@ struct tx_cpu { kmutex_t tc_lock; kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[16]; }; typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects right to enter txg */ kmutex_t tx_sync_lock; /* protects tx_state_t */ - krwlock_t tx_suspend; uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ @@ -64,6 +64,8 @@ typedef struct tx_state { kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h index 93d936a..b5bb915 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,19 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_UBERBLOCK_H #define _SYS_UBERBLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include -#include #ifdef __cplusplus extern "C" { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h index b49df8a..6ab6aa3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_UBERBLOCK_IMPL_H @@ -33,11 +32,6 @@ extern "C" { #endif /* - * For zdb use and debugging purposes only - */ -extern uint64_t ub_max_txg; - -/* * The uberblock version is incremented whenever an incompatible on-disk * format change is made to the SPA, DMU, or ZAP. * @@ -57,6 +51,9 @@ struct uberblock { uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ blkptr_t ub_rootbp; /* MOS objset_phys_t */ + + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; }; #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index 9332554..941f234 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
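The tc_callbacks[] lists and tx_commit_cb_taskq added to the txg machinery above are the plumbing for transaction commit callbacks: a consumer attaches a callback to an open transaction and it runs once that txg has synced, or with an error if it never will. The usual entry point for this is dmu_tx_callback_register() in dmu.h, which is not shown in this excerpt, so treat the call below as an assumption; the sketch only illustrates the expected shape:

#include <sys/dmu.h>

/* Hypothetical commit callback; a nonzero error means the tx did not reach disk. */
static void
my_commit_cb(void *arg, int error)
{
	/* release whatever 'arg' was pinning for the callback here */
}

static void
arm_commit_cb(dmu_tx_t *tx, void *arg)
{
	/* Assumed API, see dmu.h; typically called after dmu_tx_assign() succeeds. */
	dmu_tx_callback_register(tx, my_commit_cb, arg);
}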
*/ #ifndef _SYS_VDEV_H @@ -47,10 +46,11 @@ typedef enum vdev_dtl_type { extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *); +extern boolean_t vdev_uses_zvols(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); @@ -69,26 +69,31 @@ extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); +extern void vdev_hold(vdev_t *); +extern void vdev_rele(vdev_t *); + extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); +extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); -extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, - boolean_t complete); -extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); -extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta, boolean_t update_root); +extern void vdev_space_update(vdev_t *vd, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern int vdev_fault(spa_t *spa, uint64_t guid); -extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); +extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); @@ -119,8 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); +typedef enum vdev_config_flag { + VDEV_CONFIG_SPARE = 1 << 0, + VDEV_CONFIG_L2CACHE = 1 << 1, + VDEV_CONFIG_REMOVING = 1 << 2 +} vdev_config_flag_t; + +extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare, boolean_t isl2cache); + boolean_t getstats, vdev_config_flag_t flags); /* * Label routines @@ -136,7 +148,8 @@ typedef enum { VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ + VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ + VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 93e4102..7efa3f3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,8 +19,7 
@@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -62,6 +61,8 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef int vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); +typedef void vdev_hold_func_t(vdev_t *vd); +typedef void vdev_rele_func_t(vdev_t *vd); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -70,6 +71,8 @@ typedef struct vdev_ops { vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; + vdev_hold_func_t *vdev_op_hold; + vdev_rele_func_t *vdev_op_rele; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -112,19 +115,28 @@ struct vdev { uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ + vnode_t *vdev_name_vp; /* vnode for pathname */ + vnode_t *vdev_devid_vp; /* vnode for devid */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ + boolean_t vdev_expanding; /* expand the vdev? */ + boolean_t vdev_reopening; /* reopen in progress? */ + int vdev_open_error; /* error on last open */ + kthread_t *vdev_open_thread; /* thread opening children */ + uint64_t vdev_crtxg; /* txg when top-level was added */ /* * Top-level vdev state. @@ -139,10 +151,12 @@ struct vdev { txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ + uint64_t vdev_removing; /* device is being removed? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ + uint64_t vdev_ishole; /* is a hole in the namespace */ /* * Leaf vdev state. 
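With vdev_op_hold and vdev_op_rele added, every vdev_ops_t initializer grows two entries (they let a root pool keep its underlying devices held across configuration changes). A sketch of a leaf vdev's ops table with the new slots; all my_* names are placeholders, and the close/asize slots elided by the hunk above are assumed to sit between vdev_op_open and vdev_op_io_start as before:

static vdev_ops_t my_leaf_vdev_ops = {
	my_vdev_open,
	my_vdev_close,
	vdev_default_asize,	/* common helper declared later in this header */
	my_vdev_io_start,
	my_vdev_io_done,
	my_vdev_state_change,
	my_vdev_hold,		/* new: pin backing storage (root pools) */
	my_vdev_rele,		/* new: release it again */
	"my_leaf",		/* vdev_op_type */
	B_TRUE			/* vdev_op_leaf */
};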
@@ -155,6 +169,7 @@ struct vdev { uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ + uint64_t vdev_resilvering; /* persistent resilvering state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ @@ -166,6 +181,8 @@ struct vdev { boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_splitting; /* split or repair in progress */ + boolean_t vdev_delayed_close; /* delayed device close? */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? */ uint8_t vdev_cant_read; /* vdev is failing all reads */ @@ -176,6 +193,7 @@ struct vdev { vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ + vdev_aux_t vdev_label_aux; /* on-disk aux state */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -189,6 +207,8 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; +#define VDEV_RAIDZ_MAXPARITY 3 + #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_pad2) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 @@ -204,8 +224,8 @@ struct vdev { #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; - zio_block_tail_t vp_zbt; + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; + zio_eck_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { @@ -239,10 +259,14 @@ typedef struct vdev_label { #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 +#define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 /* * Allocate or free a vdev */ +extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, + vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); @@ -259,7 +283,8 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ -extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv); +extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); +extern boolean_t vdev_log_state_valid(vdev_t *vd); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -279,13 +304,15 @@ extern vdev_ops_t vdev_disk_ops; #endif extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; /* * Common size functions */ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_rsize(vdev_t *vd); +extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern void vdev_set_min_asize(vdev_t *vd); /* * zdb uses this tunable, so it must be declared here to make lint happy. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index ea3a0f6..a1130bb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZAP - ZFS Attribute Processor * @@ -87,9 +84,6 @@ extern "C" { #endif -#define ZAP_MAXNAMELEN 256 -#define ZAP_MAXVALUELEN 1024 - /* * The matchtype specifies which entry will be accessed. * MT_EXACT: only find an exact match (non-normalized) @@ -106,6 +100,18 @@ typedef enum matchtype MT_FIRST } matchtype_t; +typedef enum zap_flags { + /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ + ZAP_FLAG_HASH64 = 1 << 0, + /* Key is binary, not string (zap_add_uint64() can be used) */ + ZAP_FLAG_UINT64_KEY = 1 << 1, + /* + * First word of key (which must be an array of uint64) is + * already randomly distributed. + */ + ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, +} zap_flags_t; + /* * Create a new zapobj with no attributes and return its object number. * MT_EXACT will cause the zap object to only support MT_EXACT lookups, @@ -123,6 +129,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -185,6 +194,11 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); +int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints); int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, uint64_t *towrite, uint64_t *tooverwrite); @@ -195,9 +209,12 @@ int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, * If an attribute with the given name already exists, the call will * fail and return EEXIST. */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *name, +int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -209,6 +226,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -219,6 +239,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); +int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. @@ -229,6 +251,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); +int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap @@ -236,7 +260,6 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string @@ -253,6 +276,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj, */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); +/* Same as zap_join, but set the values to 'value'. */ +int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx); + +/* Same as zap_join, but add together any duplicated entries. */ +int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx); + /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). @@ -260,6 +291,23 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); + +/* Here the key is an int and the value is a different int. */ +int zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t *valuep); + +/* + * They name is a stringified version of key; increment its value by + * delta. Zero values will be zap_remove()-ed. + */ +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); +int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx); struct zap; struct zap_leaf; @@ -269,6 +317,7 @@ typedef struct zap_cursor { struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; + uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; } zap_cursor_t; @@ -320,6 +369,11 @@ void zap_cursor_advance(zap_cursor_t *zc); uint64_t zap_cursor_serialize(zap_cursor_t *zc); /* + * Advance the cursor to the attribute having the given key. 
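A minimal kernel-context sketch (not part of the patch) of the new binary-key interface above: a ZAP created with ZAP_FLAG_UINT64_KEY stores entries under a uint64_t[] key instead of a string, which is how the dedup table uses it. The object type and block shifts below are illustrative values only, and error handling is reduced to a plain return.

#include <sys/dmu.h>
#include <sys/zap.h>

static uint64_t
example_uint64_key_zap_create(objset_t *os, dmu_tx_t *tx)
{
	/* Binary, pre-hashed keys require the 64-bit hash as well. */
	return (zap_create_flags(os, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY | ZAP_FLAG_PRE_HASHED_KEY,
	    DMU_OT_DDT_ZAP, 12, 12, DMU_OT_NONE, 0, tx));
}

static int
example_uint64_key_roundtrip(objset_t *os, uint64_t zapobj,
    const uint64_t key[4], uint64_t value, dmu_tx_t *tx)
{
	uint64_t readback;
	int error;

	/* Key is four uint64s; value is a single uint64. */
	error = zap_add_uint64(os, zapobj, key, 4,
	    sizeof (uint64_t), 1, &value, tx);
	if (error != 0)
		return (error);
	return (zap_lookup_uint64(os, zapobj, key, 4,
	    sizeof (uint64_t), 1, &readback));
}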
+ */ +int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); + +/* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also * use a "serialized" argument of 0 to start at the beginning of the diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h index c86bb16..1dc322e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H @@ -40,13 +39,13 @@ extern int fzap_default_block_shift; #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) -#define ZAP_MAXCD (uint32_t)(-1) -#define ZAP_HASHBITS 28 #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) #define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT #define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) +#define ZAP_NEED_CD (-1U) + typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; @@ -67,9 +66,11 @@ typedef struct mzap_ent { avl_node_t mze_node; int mze_chunkid; uint64_t mze_hash; - mzap_ent_phys_t mze_phys; + uint32_t mze_cd; /* copy from mze_phys->mze_cd */ } mzap_ent_t; +#define MZE_PHYS(zap, mze) \ + (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) /* * The (fat) zap is stored in one object. It is an array of @@ -127,6 +128,7 @@ typedef struct zap_phys { uint64_t zap_num_entries; /* number of entries */ uint64_t zap_salt; /* salt to stir into hash function */ uint64_t zap_normflags; /* flags for u8_textprep_str() */ + uint64_t zap_flags; /* zap_flags_t */ /* * This structure is followed by padding, and then the embedded * pointer table. The embedded pointer table takes up second @@ -168,10 +170,13 @@ typedef struct zap { typedef struct zap_name { zap_t *zn_zap; - const char *zn_name_orij; + int zn_key_intlen; + const void *zn_key_orig; + int zn_key_orig_numints; + const void *zn_key_norm; + int zn_key_norm_numints; uint64_t zn_hash; matchtype_t zn_matchtype; - const char *zn_name_norm; char zn_normbuf[ZAP_MAXNAMELEN]; } zap_name_t; @@ -183,8 +188,11 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); void zap_evict(dmu_buf_t *db, void *vmzap); -zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt); +zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); +int zap_hashbits(zap_t *zap); +uint32_t zap_maxcd(zap_t *zap); +uint64_t zap_getflags(zap_t *zap); #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 
0 : ((hash) >> (64 - (n)))) @@ -193,6 +201,7 @@ int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); +void fzap_prefetch(zap_name_t *zn); int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, @@ -209,7 +218,8 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); +int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h index 14144e0..3a33636 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,20 +19,21 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H -#pragma ident "%Z%%M% %I% %E% SMI" +#include #ifdef __cplusplus extern "C" { #endif struct zap; +struct zap_name; +struct zap_stats; #define ZAP_LEAF_MAGIC 0x2AB1EAF @@ -129,12 +130,12 @@ typedef struct zap_leaf_phys { typedef union zap_leaf_chunk { struct zap_leaf_entry { uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_int_size; /* size of ints */ + uint8_t le_value_intlen; /* size of value's ints */ uint16_t le_next; /* next entry in hash chain */ uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_name_numints; /* ints in name (incl null) */ uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_length; /* value length in ints */ + uint16_t le_value_numints; /* value length in ints */ uint32_t le_cd; /* collision differentiator */ uint64_t le_hash; /* hash value of the name */ } l_entry; @@ -177,7 +178,7 @@ typedef struct zap_entry_handle { * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, - zap_name_t *zn, zap_entry_handle_t *zeh); + struct zap_name *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the @@ -193,10 +194,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l, * num_integers in the attribute. */ extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); + uint8_t integer_size, uint64_t num_integers, void *buf); -extern int zap_entry_read_name(const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); +extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); /* * Replace the value of an existing entry. @@ -204,7 +205,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh, * zap_entry_update may fail if it runs out of space (ENOSPC). */ extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); + uint8_t integer_size, uint64_t num_integers, const void *buf); /* * Remove an entry. 
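A minimal sketch (not part of the patch) of what the slimmed-down mzap_ent_t means in practice: the in-core AVL node now keeps only the chunk id, the hash and a copy of the collision differentiator, and any other field is read back through MZE_PHYS() from the dbuf-backed microzap block.

static uint64_t
example_mze_value(zap_t *zap, mzap_ent_t *mze)
{
	/* mze_value lives only in the on-disk chunk, not in mzap_ent_t. */
	return (MZE_PHYS(zap, mze)->mze_value);
}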
@@ -216,17 +217,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh); * belong in this leaf (according to its hash value). Fills in the * entry handle on success. Returns 0 on success or ENOSPC on failure. */ -extern int zap_entry_create(zap_leaf_t *l, - const char *name, uint64_t h, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); +extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); /* * Return true if there are additional entries with the same normalized * form. */ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, - zap_name_t *zn, const char *name, zap_t *zap); + struct zap_name *zn, const char *name, struct zap *zap); /* * Other stuff. @@ -235,7 +235,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); -extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); +extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, + struct zap_stats *zs); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h index ea15095..d3c471a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_ACL_H @@ -32,6 +31,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -105,12 +105,18 @@ typedef struct zfs_acl_phys_v0 { #define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) +/* + * Size of ACL count is always 2 bytes. + * Necessary to for dealing with both V0 ACL and V1 ACL layout + */ +#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t)) + typedef struct zfs_acl_phys { uint64_t z_acl_extern_obj; /* ext acl pieces */ uint32_t z_acl_size; /* Number of bytes in ACL */ uint16_t z_acl_version; /* acl version */ uint16_t z_acl_count; /* ace count */ - uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; typedef struct acl_ops { @@ -145,21 +151,26 @@ typedef struct zfs_acl_node { void *z_allocdata; /* pointer to kmem allocated memory */ size_t z_allocsize; /* Size of blob in bytes */ size_t z_size; /* length of ACL data */ - int z_ace_count; /* number of ACEs in this acl node */ + uint64_t z_ace_count; /* number of ACEs in this acl node */ int z_ace_idx; /* ace iterator positioned on */ } zfs_acl_node_t; typedef struct zfs_acl { - int z_acl_count; /* Number of ACEs */ + uint64_t z_acl_count; /* Number of ACEs */ size_t z_acl_bytes; /* Number of bytes in ACL */ uint_t z_version; /* version of ACL */ void *z_next_ace; /* pointer to next ACE */ - int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) 
*/ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ } zfs_acl_t; +typedef struct acl_locator_cb { + zfs_acl_t *cb_aclp; + zfs_acl_node_t *cb_acl_node; +} zfs_acl_locator_cb_t; + #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) @@ -206,7 +217,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); @@ -214,11 +225,20 @@ void zfs_acl_free(zfs_acl_t *); int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, struct zfs_fuid_info **, zfs_acl_t **); int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); +uint64_t zfs_external_acl(struct znode *); +int zfs_znode_acl_version(struct znode *); +int zfs_acl_size(struct znode *, int *); +zfs_acl_t *zfs_acl_alloc(int); +zfs_acl_node_t *zfs_acl_node_alloc(size_t); +void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); +void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, + uint64_t *, uint64_t, uint64_t); +int zfs_acl_chown_setattr(struct znode *); #endif #ifdef __cplusplus } #endif - -#endif /* !ZFS_NO_ACL */ +#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 952bb24..6dc163d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -54,6 +52,8 @@ extern "C" { #include #include #include +#include +#include #include #include #include @@ -83,10 +83,11 @@ extern "C" { #include #include #include -#include +#include #include #include #include +#include #include diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h index 450ac1c..50ecf9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_ZFS_DEBUG_H #define _SYS_ZFS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func, extern void zfs_panic_recover(const char *fmt, ...); +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +extern void zfs_dbgmsg_init(void); +extern void zfs_dbgmsg_fini(void); +extern void zfs_dbgmsg(const char *fmt, ...); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h index bd2c938..349f8ef 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_DIR_H #define _SYS_FS_ZFS_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_ids_t *); + uint_t, znode_t **, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h index c035707..b381bb9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include #endif #include +#include #ifdef __cplusplus extern "C" { @@ -100,6 +101,8 @@ typedef struct zfs_fuid_info { #ifdef _KERNEL struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, + uint64_t, uint64_t, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, cred_t *, zfs_fuid_info_t **); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index bf107d6..63b9c57 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include +#include +#include #ifdef _KERNEL #include @@ -47,26 +46,86 @@ extern "C" { #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 -#define DMU_BACKUP_STREAM_VERSION (1ULL) -#define DMU_BACKUP_HEADER_VERSION (2ULL) +/* + * Field manipulation macros for the drr_versioninfo field of the + * send stream header. + */ + +/* + * Header types for zfs send streams. + */ +typedef enum drr_headertype { + DMU_SUBSTREAM = 0x1, + DMU_COMPOUNDSTREAM = 0x2 +} drr_headertype_t; + +#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) +#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) + +#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) +#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) + +/* + * Feature flags for zfs send streams (flags in drr_versioninfo) + */ + +#define DMU_BACKUP_FEATURE_DEDUP (0x1) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) +#define DMU_BACKUP_FEATURE_SA_SPILL (0x4) + +/* + * Mask of all supported backup features + */ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ + DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL) + +/* Are all features in the given flag word currently supported? */ +#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) + +/* + * The drr_versioninfo field of the dmu_replay_record has the + * following layout: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | reserved | feature-flags |C|S| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * The low order two bits indicate the header type: SUBSTREAM (0x1) + * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: + * this field used to be a version number, where the two version types + * were 1 and 2. Using two bits for this allows earlier versions of + * the code to be able to recognize send streams that don't use any + * of the features indicated by feature flags. 
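A stand-alone sketch (not part of the patch) of the packing described above: bits 0-1 of drr_versioninfo carry the header type and bits 2-31 carry the feature flags. The open-coded shifts below mirror what DMU_SET_STREAM_HDRTYPE()/DMU_SET_FEATUREFLAGS() do via BF64_SET(); the numeric constants are the DMU_COMPOUNDSTREAM and DMU_BACKUP_FEATURE_* values defined here.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t vi = 0;

	vi |= (uint64_t)0x2 & 0x3;			/* DMU_COMPOUNDSTREAM */
	vi |= ((uint64_t)(0x1 | 0x4) & 0x3fffffffULL) << 2; /* DEDUP | SA_SPILL */

	printf("hdrtype=%llu featureflags=0x%llx\n",
	    (unsigned long long)(vi & 0x3),		/* prints 2 */
	    (unsigned long long)((vi >> 2) & 0x3fffffffULL)); /* prints 0x5 */
	return (0);
}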
+ */ + #define DMU_BACKUP_MAGIC 0x2F5bacbacULL #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* + * flags in the drr_checksumflags field in the DRR_WRITE and + * DRR_WRITE_BYREF blocks + */ +#define DRR_CHECKSUM_DEDUP (1<<0) + +#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) + +/* * zfs ioctl command structure */ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, + DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, + DRR_SPILL, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; - uint64_t drr_version; + uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; @@ -76,6 +135,7 @@ typedef struct dmu_replay_record { } drr_begin; struct drr_end { zio_cksum_t drr_checksum; + uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; @@ -83,14 +143,16 @@ typedef struct dmu_replay_record { dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; - uint8_t drr_checksum; + uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_pad[6]; + uint64_t drr_toguid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; + uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; @@ -98,16 +160,61 @@ typedef struct dmu_replay_record { uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; } drr_free; + struct drr_write_byref { + /* where to put the data */ + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + /* where to find the prior copy of the data */ + uint64_t drr_refguid; + uint64_t drr_refobject; + uint64_t drr_refoffset; + /* properties of the data */ + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ + } drr_write_byref; + struct drr_spill { + uint64_t drr_object; + uint64_t drr_length; + uint64_t drr_toguid; + uint64_t drr_pad[4]; /* needed for crypto */ + /* spill data follows */ + } drr_spill; } drr_u; } dmu_replay_record_t; +/* diff record range types */ +typedef enum diff_type { + DDR_NONE = 0x1, + DDR_INUSE = 0x2, + DDR_FREE = 0x4 +} diff_type_t; + +/* + * The diff reports back ranges of free or in-use objects. 
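A stand-alone sketch (not part of the patch) of how a consumer might drain these records from the file descriptor that the zfs diff ioctl writes into. The struct is re-declared locally only so the example compiles on its own; real consumers take it from zfs_ioctl.h.

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

typedef struct dmu_diff_record {
	uint64_t ddr_type;	/* DDR_INUSE (0x2) or DDR_FREE (0x4) */
	uint64_t ddr_first;	/* first object in the range */
	uint64_t ddr_last;	/* last object in the range */
} dmu_diff_record_t;

static void
drain_diff_records(int fd)
{
	dmu_diff_record_t ddr;

	while (read(fd, &ddr, sizeof (ddr)) == (ssize_t)sizeof (ddr)) {
		printf("%s objects %llu-%llu\n",
		    ddr.ddr_type == 0x2 ? "inuse" : "free",
		    (unsigned long long)ddr.ddr_first,
		    (unsigned long long)ddr.ddr_last);
	}
}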
+ */ +typedef struct dmu_diff_record { + uint64_t ddr_type; + uint64_t ddr_first; + uint64_t ddr_last; +} dmu_diff_record_t; + typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; @@ -119,6 +226,10 @@ typedef struct zinject_record { uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -146,8 +257,9 @@ typedef enum zfs_case { typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; @@ -162,11 +274,21 @@ typedef struct zfs_cmd { uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_pad[4]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; } zfs_cmd_t; typedef struct zfs_useracct { @@ -176,8 +298,10 @@ typedef struct zfs_useracct { uint64_t zu_space; } zfs_useracct_t; -#define ZVOL_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) +#define ZFSDEV_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) + +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL @@ -191,7 +315,29 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); -extern int zfs_unmount_snap(char *, void *); +extern int zfs_unmount_snap(const char *, void *); + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern void *zfsdev_get_soft_state(minor_t minor, + enum zfs_soft_state_type which); +extern minor_t zfsdev_minor_alloc(void); + +extern void *zfsdev_state; +extern kmutex_t zfsdev_state_lock; #endif /* _KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h new file mode 100644 index 0000000..4982bd4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_ZFS_ONEXIT_H +#define _SYS_ZFS_ONEXIT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct zfs_onexit { + kmutex_t zo_lock; + list_t zo_actions; +} zfs_onexit_t; + +typedef struct zfs_onexit_action_node { + list_node_t za_link; + void (*za_func)(void *); + void *za_data; +} zfs_onexit_action_node_t; + +extern void zfs_onexit_init(zfs_onexit_t **zo); +extern void zfs_onexit_destroy(zfs_onexit_t *zo); + +#endif + +extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); +extern void zfs_onexit_fd_rele(int fd); +extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle); +extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, + boolean_t fire); +extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, + void **data); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_ONEXIT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h new file mode 100644 index 0000000..fc40b0e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_SA_H +#define _SYS_ZFS_SA_H + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include + + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the list of known attributes + * to the ZPL. The values of the actual + * attributes are not defined by the order + * the enums. It is controlled by the attribute + * registration mechanism. Two different file system + * could have different numeric values for the same + * attributes. this list is only used for dereferencing + * into the table that will hold the actual numeric value. 
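A minimal kernel-context sketch (not part of the patch) of what the indirection described above means: zpl_attr_t values index a per-filesystem sa_attr_type_t mapping table built at mount time, so an SA access always goes through that table rather than a fixed on-disk attribute number. The helper name and the attr_table parameter are illustrative.

#include <sys/sa.h>
#include <sys/zfs_sa.h>

static int
example_read_mode(sa_handle_t *hdl, sa_attr_type_t *attr_table,
    uint64_t *modep)
{
	/* attr_table[ZPL_MODE] is this filesystem's numeric id for MODE. */
	return (sa_lookup(hdl, attr_table[ZPL_MODE], modep, sizeof (*modep)));
}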
+ */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_END +} zpl_attr_t; + +#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108 +#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \ + sizeof (zfs_acl_phys_t)) + +#define SA_MODE_OFFSET 0 +#define SA_SIZE_OFFSET 8 +#define SA_GEN_OFFSET 16 +#define SA_UID_OFFSET 24 +#define SA_GID_OFFSET 32 +#define SA_PARENT_OFFSET 40 + +extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1]; +extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1]; + +/* + * This is a deprecated data structure that only exists for + * dealing with file systems create prior to ZPL version 5. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +#ifdef _KERNEL +int zfs_sa_readlink(struct znode *, uio_t *); +void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); +void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); +void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); +void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); +void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_SA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h new file mode 100644 index 0000000..a8af7ec --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_STAT_H +#define _SYS_FS_ZFS_STAT_H + +#ifdef _KERNEL +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A limited number of zpl level stats are retrievable + * with an ioctl. zfs diff is the current consumer. + */ +typedef struct zfs_stat { + uint64_t zs_gen; + uint64_t zs_mode; + uint64_t zs_links; + uint64_t zs_ctime[2]; +} zfs_stat_t; + +extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_STAT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index 163a800..c328a03 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H @@ -29,6 +28,7 @@ #include #include #include +#include #include #include @@ -37,6 +37,7 @@ extern "C" { #endif typedef struct zfsvfs zfsvfs_t; +struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ @@ -54,7 +55,6 @@ struct zfsvfs { boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ - uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ @@ -71,12 +71,14 @@ struct zfsvfs { boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ - kmutex_t z_online_recv_lock; /* held while recv in progress */ + boolean_t z_use_sa; /* version allow system attributes */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -132,19 +134,23 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname); extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valuep); extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota); -extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, - boolean_t isgroup, uint64_t fuid); +extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, + boolean_t isgroup); +extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, + uint64_t fuid); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); -extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp); +extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern int zfs_vnode_lock(vnode_t *vp, int flags); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index 6f0a436..d3955d7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_FS_ZFS_ZNODE_H @@ -29,8 +28,11 @@ #ifdef _KERNEL #include #include +#include #include #include +#include +#include #endif #include #include @@ -54,13 +56,18 @@ extern "C" { #define ZFS_OPAQUE 0x0000010000000000 #define ZFS_AV_QUARANTINED 0x0000020000000000 #define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_REPARSE 0x0000080000000000 +#define ZFS_OFFLINE 0x0000100000000000 +#define ZFS_SPARSE 0x0000200000000000 -#define ZFS_ATTR_SET(zp, attr, value) \ +#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ { \ if (value) \ - zp->z_phys->zp_flags |= attr; \ + pflags |= attr; \ else \ - zp->z_phys->zp_flags &= ~attr; \ + pflags &= ~attr; \ + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ + &pflags, sizeof (pflags), tx)); \ } /* @@ -76,25 +83,46 @@ extern "C" { #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ #define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ +#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] +#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] +#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] +#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] +#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] +#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] +#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] +#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] +#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] +#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] +#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] +#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] +#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] +#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] +#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] +#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] +#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] +#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] +#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] +#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] + /* * Is ID ephemeral? */ -#ifdef TODO #define IS_EPHEMERAL(x) (x > MAXUID) -#else -#define IS_EPHEMERAL(x) (0) -#endif /* * Should we use FUIDs? */ -#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) +#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) #define MASTER_NODE_OBJ 1 /* * Special attributes for master node. + * "userquota@" and "groupquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). */ #define ZFS_FSID "FSID" #define ZFS_UNLINKED_SET "DELETE_QUEUE" @@ -102,6 +130,7 @@ extern "C" { #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" #define ZFS_SHARES_DIR "SHARES" +#define ZFS_SA_ATTRS "SA_ATTRS" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -132,42 +161,6 @@ extern "C" { #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) /* - * This is the persistent portion of the znode. It is stored - * in the "bonus buffer" of the file. Short symbolic links - * are also stored in the bonus buffer. 
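A minimal kernel-context sketch (not part of the patch): with the persistent znode fields now behind the SA handle and cached in znode_t, an update writes through the SA_ZPL_*() mapping and keeps the in-core copy coherent, the same pattern the ZFS_ATTR_SET() macro above uses for the flags word. The helper name is illustrative.

static int
example_set_size(znode_t *zp, uint64_t new_size, dmu_tx_t *tx)
{
	zp->z_size = new_size;	/* keep the cached copy coherent */
	return (sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx));
}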
- */ -typedef struct znode_phys { - uint64_t zp_atime[2]; /* 0 - last file access time */ - uint64_t zp_mtime[2]; /* 16 - last file modification time */ - uint64_t zp_ctime[2]; /* 32 - last file change time */ - uint64_t zp_crtime[2]; /* 48 - creation time */ - uint64_t zp_gen; /* 64 - generation (txg of creation) */ - uint64_t zp_mode; /* 72 - file mode bits */ - uint64_t zp_size; /* 80 - size of file */ - uint64_t zp_parent; /* 88 - directory parent (`..') */ - uint64_t zp_links; /* 96 - number of links to file */ - uint64_t zp_xattr; /* 104 - DMU object for xattrs */ - uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ - uint64_t zp_flags; /* 120 - persistent flags */ - uint64_t zp_uid; /* 128 - file owner */ - uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_zap; /* 144 - extra attributes */ - uint64_t zp_pad[3]; /* 152 - future */ - zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ - /* - * Data may pad out any remaining bytes in the znode buffer, eg: - * - * |<---------------------- dnode_phys (512) ------------------------>| - * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| - * |<---- znode (264) ---->|<---- data (56) ---->| - * - * At present, we use this space for the following: - * - symbolic links - * - 32-byte anti-virus scanstamp (regular files only) - */ -} znode_phys_t; - -/* * Directory entry locks control access to directory entries. * They are used to protect creates, deletes, and renames. * Each directory znode has a mutex and a list of locked names. @@ -196,20 +189,24 @@ typedef struct znode { uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint8_t z_moved; /* Has this znode been moved? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ - uint64_t z_last_itx; /* last ZIL itx on this znode */ - uint64_t z_gen; /* generation (same as zp_gen) */ + uint64_t z_gen; /* generation (cached) */ + uint64_t z_size; /* file size (cached) */ + uint64_t z_atime[2]; /* atime (cached) */ + uint64_t z_links; /* file links (cached) */ + uint64_t z_pflags; /* pflags (cached) */ + uint64_t z_uid; /* uid fuid (cached) */ + uint64_t z_gid; /* gid fuid (cached) */ + mode_t z_mode; /* mode (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ - /* - * These are dmu managed fields. - */ - znode_phys_t *z_phys; /* pointer to persistent znode */ - dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ + sa_handle_t *z_sa_hdl; /* handle to sa data */ + boolean_t z_is_sa; /* are we native sa? */ /* FreeBSD-specific field. 
*/ struct task z_task; } znode_t; @@ -277,7 +274,7 @@ VTOZ(vnode_t *vp) #define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) #define ZFS_VERIFY_ZP(zp) \ - if ((zp)->z_dbuf == NULL) { \ + if ((zp)->z_sa_hdl == NULL) { \ ZFS_EXIT((zp)->z_zfsvfs); \ return (EIO); \ } \ @@ -319,14 +316,14 @@ VTOZ(vnode_t *vp) #define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ - zfs_time_stamper(zp, ACCESSED, NULL) + zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); extern int zfs_init_fs(zfsvfs_t *, znode_t **); extern void zfs_set_dataprop(objset_t *); extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, dmu_tx_t *tx); -extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); -extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], + uint64_t [2], boolean_t); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); @@ -349,7 +346,8 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, vattr_t *vap); extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name); + znode_t *dzp, char *name, uint64_t foid); +#define ZFS_NO_OBJECT 0 /* no object id */ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -366,7 +364,7 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); #endif -extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index efbf65e..a4c5575 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_ZIL_H #define _SYS_ZIL_H @@ -55,34 +56,40 @@ typedef struct zil_header { uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ uint64_t zh_flags; /* header flags */ - uint64_t zh_pad[4]; + uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ + uint64_t zh_pad[3]; } zil_header_t; /* * zh_flags bit settings */ -#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ /* - * Log block trailer - structure at the end of the header and each log block + * Log block chaining. + * + * Log blocks are chained together. Originally they were chained at the + * end of the block. For performance reasons the chain was moved to the + * beginning of the block which allows writes for only the data being used. + * The older position is supported for backwards compatability. * - * The zit_bt contains a zbt_cksum which for the intent log is + * The zio_eck_t contains a zec_cksum which for the intent log is * the sequence number of this log block. A seq of 0 is invalid. - * The zbt_cksum is checked by the SPA against the sequence + * The zec_cksum is checked by the SPA against the sequence * number passed in the blk_cksum field of the blkptr_t */ -typedef struct zil_trailer { - uint64_t zit_pad; - blkptr_t zit_next_blk; /* next block in chain */ - uint64_t zit_nused; /* bytes in log block used */ - zio_block_tail_t zit_bt; /* block trailer */ -} zil_trailer_t; +typedef struct zil_chain { + uint64_t zc_pad; + blkptr_t zc_next_blk; /* next block in chain */ + uint64_t zc_nused; /* bytes in log block used */ + zio_eck_t zc_eck; /* block trailer */ +} zil_chain_t; #define ZIL_MIN_BLKSZ 4096ULL #define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE -#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) /* * The words of a log block checksum. @@ -150,16 +157,26 @@ typedef enum zil_create { #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* + * Transactions for write, truncate, setattr, acl_v0, and acl can be logged + * out of order. For convenience in the code, all such records must have + * lr_foid at the same offset. + */ +#define TX_OOO(txtype) \ + ((txtype) == TX_WRITE || \ + (txtype) == TX_TRUNCATE || \ + (txtype) == TX_SETATTR || \ + (txtype) == TX_ACL_V0 || \ + (txtype) == TX_ACL || \ + (txtype) == TX_WRITE2) + +/* * Format of log records. * The fields are carefully defined to allow them to be aligned * and sized the same on sparc & intel architectures. * Each log record has a common structure at the beginning. * - * Note, lrc_seq holds two different sequence numbers. Whilst in memory - * it contains the transaction sequence number. The log record on - * disk holds the sequence number of all log records which is used to - * ensure we don't replay the same record. The two sequence numbers are - * different because the transactions can now be pushed out of order. + * The log record on disk (lrc_seq) holds the sequence number of all log + * records which is used to ensure we don't replay the same record. 
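A minimal sketch (not part of the patch) of what moving the chain structure to the front of the block buys: the reader picks up the used byte count and the pointer to the next block without touching the rest of the buffer. This only demonstrates the layout; the real traversal is done by zil_parse() and the log block reader in zil.c.

static void
example_read_chain_header(const char *blkbuf, uint64_t *nused,
    const blkptr_t **next_bpp)
{
	const zil_chain_t *zilc = (const zil_chain_t *)blkbuf;

	*nused = zilc->zc_nused;	/* bytes of log records in this block */
	*next_bpp = &zilc->zc_next_blk;	/* next block in the chain */
}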
*/ typedef struct { /* common log record header */ uint64_t lrc_txtype; /* intent log transaction type */ @@ -169,6 +186,14 @@ typedef struct { /* common log record header */ } lr_t; /* + * Common start of all out-of-order record types (TX_OOO() above). + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id */ +} lr_ooo_t; + +/* * Handle option extended vattr attributes. * * Whenever new attributes are added the version number @@ -258,7 +283,7 @@ typedef struct { uint64_t lr_foid; /* file object to write */ uint64_t lr_offset; /* offset to write to */ uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + uint64_t lr_blkoff; /* no longer used */ blkptr_t lr_blkptr; /* spa block pointer for replay */ /* write data will follow for small writes */ } lr_write_t; @@ -306,13 +331,34 @@ typedef struct { */ /* - * ZFS intent log transaction structure + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. + * There are a few requirements for this to occur: + * - write is greater than zfs/zvol_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FSYNC or FDSYNC), the we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. 
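A minimal sketch (not part of the patch) of the decision the three cases above describe, as a ZPL- or zvol-style caller might make it. The threshold, slog test and single-block test are simplified stand-ins for the real logic in zfs_log_write() and zvol_log_write().

static itx_wr_state_t
example_pick_wr_state(uint64_t resid, boolean_t sync_commit,
    boolean_t have_slog, uint64_t immediate_write_sz, uint64_t blocksize)
{
	if (resid > immediate_write_sz && !have_slog && resid <= blocksize)
		return (WR_INDIRECT);	/* dmu_sync() data; log just the blkptr */
	if (sync_commit)
		return (WR_COPIED);	/* copy data into the log record now */
	return (WR_NEED_COPY);		/* fetch via the dmu only if flushed */
}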
*/ typedef enum { WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ /* and put blkptr in log, rather than actual data) */ WR_COPIED, /* immediate - data is copied into lr_write_t */ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ + WR_NUM_STATES /* number of states */ } itx_wr_state_t; typedef struct itx { @@ -321,30 +367,19 @@ typedef struct itx { itx_wr_state_t itx_wr_state; /* write state */ uint8_t itx_sync; /* synchronous transaction */ uint64_t itx_sod; /* record size on disk */ + uint64_t itx_oid; /* object id */ lr_t itx_lr; /* common part of log record */ /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; - -/* - * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() - * to handle the cleanup of the dmu_sync() buffer write - */ -typedef struct { - zilog_t *zgd_zilog; /* zilog */ - blkptr_t *zgd_bp; /* block pointer */ - struct rl *zgd_rl; /* range lock */ -} zgd_t; - - -typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, +typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg); -typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, +typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); -extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, +extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); extern void zil_init(void); @@ -358,28 +393,33 @@ extern void zil_close(zilog_t *zilog); extern void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]); +extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); -extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); +extern void zil_itx_destroy(itx_t *itx); +extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); -extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); +extern void zil_commit(zilog_t *zilog, uint64_t oid); -extern int zil_vdev_offline(char *osname, void *txarg); -extern int zil_claim(char *osname, void *txarg); -extern int zil_check_log_chain(char *osname, void *txarg); +extern int zil_vdev_offline(const char *osname, void *txarg); +extern int zil_claim(const char *osname, void *txarg); +extern int zil_check_log_chain(const char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_clean(zilog_t *zilog); -extern int zil_is_committed(zilog_t *zilog); +extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); -extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); -extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr); +extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); +extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); + +extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); + +extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); -extern int zil_disable; +extern int zil_replay_disable; #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h index 3f25829..1d4c0cc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H @@ -43,12 +44,34 @@ typedef struct lwb { int lwb_sz; /* size of block and buffer */ char *lwb_buf; /* log write buffer */ zio_t *lwb_zio; /* zio for this buffer */ + dmu_tx_t *lwb_tx; /* tx for log block allocation */ uint64_t lwb_max_txg; /* highest txg in this lwb */ - txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ } lwb_t; /* + * Intent log transaction lists + */ +typedef struct itxs { + list_t i_sync_list; /* list of synchronous itxs */ + avl_tree_t i_async_tree; /* tree of foids for async itxs */ +} itxs_t; + +typedef struct itxg { + kmutex_t itxg_lock; /* lock for this structure */ + uint64_t itxg_txg; /* txg for this chain */ + uint64_t itxg_sod; /* total size on disk for this txg */ + itxs_t *itxg_itxs; /* sync and async itxs */ +} itxg_t; + +/* for async nodes we build up an AVL tree of lists of async itxs per file */ +typedef struct itx_async_node { + uint64_t ia_foid; /* file object id */ + list_t ia_list; /* list of async itxs for this foid */ + avl_node_t ia_node; /* AVL tree linkage */ +} itx_async_node_t; + +/* * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs * we've touched so we know which ones need a write cache flush at the end. */ @@ -57,6 +80,8 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_PREV_BLKS 16 + /* * Stable storage intent log management structure. One per dataset. 
*/ @@ -68,9 +93,8 @@ struct zilog { objset_t *zl_os; /* object set we're logging */ zil_get_data_t *zl_get_data; /* callback to get object content */ zio_t *zl_root_zio; /* log writer root zio */ - uint64_t zl_itx_seq; /* next itx sequence number */ - uint64_t zl_commit_seq; /* committed upto this number */ - uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_lr_seq; /* on-disk log record sequence number */ + uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ uint64_t zl_replaying_seq; /* current replay seq number */ @@ -82,24 +106,39 @@ struct zilog { uint8_t zl_replay; /* replaying records while set */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ - uint8_t zl_log_error; /* boolean: log write error */ - list_t zl_itx_list; /* in-memory itx list */ + uint8_t zl_logbias; /* latency or throughput */ + uint8_t zl_sync; /* synchronous or asynchronous */ + int zl_parse_error; /* last zil_parse() error */ + uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ + uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ + uint64_t zl_parse_blk_count; /* number of blocks parsed */ + uint64_t zl_parse_lr_count; /* number of log records parsed */ + uint64_t zl_next_batch; /* next batch number */ + uint64_t zl_com_batch; /* committed batch number */ + kcondvar_t zl_cv_batch[2]; /* batch condition variables */ + itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ + list_t zl_itx_commit_list; /* itx list to be committed */ uint64_t zl_itx_list_sz; /* total size of records on list */ uint64_t zl_cur_used; /* current commit log size used */ - uint64_t zl_prev_used; /* previous commit log size used */ list_t zl_lwb_list; /* in-flight log write list */ kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ - avl_tree_t zl_dva_tree; /* track DVAs during log parse */ + avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ + zil_header_t zl_old_header; /* debugging aid */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_rotor; /* rotor for zl_prev[] */ }; -typedef struct zil_dva_node { +typedef struct zil_bp_node { dva_t zn_dva; avl_node_t zn_node; -} zil_dva_node_t; +} zil_bp_node_t; + +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ + sizeof (lr_write_t)) #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 049c122..355f560 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _ZIO_H @@ -38,12 +37,15 @@ extern "C" { #endif -#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ +/* + * Embedded checksum + */ +#define ZEC_MAGIC 0x210da7ab10c7a11ULL -typedef struct zio_block_tail { - uint64_t zbt_magic; /* for validation, endianness */ - zio_cksum_t zbt_cksum; /* 256-bit checksum */ -} zio_block_tail_t; +typedef struct zio_eck { + uint64_t zec_magic; /* for validation, endianness */ + zio_cksum_t zec_cksum; /* 256-bit checksum */ +} zio_eck_t; /* * Gang block headers are self-checksumming and contain an array @@ -51,16 +53,16 @@ typedef struct zio_block_tail { */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) + sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t) - \ + sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; - zio_block_tail_t zg_tail; + zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { @@ -73,12 +75,19 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_FUNCTIONS }; #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON +#define ZIO_CHECKSUM_MASK 0xffULL +#define ZIO_CHECKSUM_VERIFY (1 << 8) + +#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 +#define ZIO_DEDUPDITTO_MIN 100 + enum zio_compress { ZIO_COMPRESS_INHERIT = 0, ZIO_COMPRESS_ON, @@ -94,12 +103,19 @@ enum zio_compress { ZIO_COMPRESS_GZIP_7, ZIO_COMPRESS_GZIP_8, ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, ZIO_COMPRESS_FUNCTIONS }; #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) + #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 @@ -115,73 +131,81 @@ enum zio_compress { #define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) #define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) #define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) -#define ZIO_PRIORITY_TABLE_SIZE 11 - -#define ZIO_FLAG_MUSTSUCCEED 0x000000 -#define ZIO_FLAG_CANFAIL 0x000001 -#define ZIO_FLAG_SPECULATIVE 0x000002 -#define ZIO_FLAG_CONFIG_WRITER 0x000004 -#define ZIO_FLAG_DONT_RETRY 0x000008 - -#define ZIO_FLAG_DONT_CACHE 0x000010 -#define ZIO_FLAG_DONT_QUEUE 0x000020 -#define ZIO_FLAG_DONT_AGGREGATE 0x000040 -#define ZIO_FLAG_DONT_PROPAGATE 0x000080 - -#define ZIO_FLAG_IO_BYPASS 0x000100 -#define ZIO_FLAG_IO_REPAIR 0x000200 -#define ZIO_FLAG_IO_RETRY 0x000400 -#define ZIO_FLAG_IO_REWRITE 0x000800 - -#define ZIO_FLAG_SELF_HEAL 0x001000 -#define ZIO_FLAG_RESILVER 0x002000 -#define ZIO_FLAG_SCRUB 0x004000 -#define ZIO_FLAG_SCRUB_THREAD 0x008000 - -#define ZIO_FLAG_PROBE 0x010000 -#define ZIO_FLAG_GANG_CHILD 0x020000 -#define ZIO_FLAG_RAW 0x040000 -#define ZIO_FLAG_GODFATHER 0x080000 - -#define ZIO_FLAG_TRYHARD 0x100000 - -#define ZIO_FLAG_GANG_INHERIT \ - (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_SPECULATIVE | \ - ZIO_FLAG_CONFIG_WRITER | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_DONT_AGGREGATE | \ - ZIO_FLAG_SELF_HEAL | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) - 
-#define ZIO_FLAG_VDEV_INHERIT \ - (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_IO_RETRY | \ - ZIO_FLAG_PROBE | \ - ZIO_FLAG_TRYHARD) - -#define ZIO_FLAG_AGG_INHERIT \ - (ZIO_FLAG_DONT_AGGREGATE | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_SELF_HEAL | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) +#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) +#define ZIO_PRIORITY_TABLE_SIZE 12 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +enum zio_flag { + /* + * Flags inherited by gang, ddt, and vdev children, + * and that must be equal for two zios to aggregate + */ + ZIO_FLAG_DONT_AGGREGATE = 1 << 0, + ZIO_FLAG_IO_REPAIR = 1 << 1, + ZIO_FLAG_SELF_HEAL = 1 << 2, + ZIO_FLAG_RESILVER = 1 << 3, + ZIO_FLAG_SCRUB = 1 << 4, + ZIO_FLAG_SCAN_THREAD = 1 << 5, + +#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) + + /* + * Flags inherited by ddt, gang, and vdev children. + */ + ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 7, + ZIO_FLAG_CONFIG_WRITER = 1 << 8, + ZIO_FLAG_DONT_RETRY = 1 << 9, + ZIO_FLAG_DONT_CACHE = 1 << 10, + ZIO_FLAG_NODATA = 1 << 11, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + +#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) +#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) + + /* + * Flags inherited by vdev children. + */ + ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 14, + ZIO_FLAG_TRYHARD = 1 << 15, + ZIO_FLAG_OPTIONAL = 1 << 16, + +#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) + + /* + * Flags not inherited by any children. + */ + ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 18, + ZIO_FLAG_IO_BYPASS = 1 << 19, + ZIO_FLAG_IO_REWRITE = 1 << 20, + ZIO_FLAG_RAW = 1 << 21, + ZIO_FLAG_GANG_CHILD = 1 << 22, + ZIO_FLAG_DDT_CHILD = 1 << 23, + ZIO_FLAG_GODFATHER = 1 << 24 +}; + +#define ZIO_FLAG_MUSTSUCCEED 0 + +#define ZIO_DDT_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ + ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) + #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) +#define ZIO_VDEV_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ + ZIO_FLAG_CANFAIL) + enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, + ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; @@ -193,13 +217,13 @@ enum zio_wait_type { }; /* - * We'll take the EILSEQ and ENOMSG to indicate checksum errors and - * fragmentation. + * We'll take the number 122 and 123 to indicate checksum errors and + * fragmentation. Those doesn't collide with any errno values as they + * are greater than ELAST. */ -#define ECKSUM EILSEQ -#define EFRAGS ENOMSG +#define ECKSUM 122 +#define EFRAGS 123 -typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; @@ -208,18 +232,15 @@ extern char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is - * level -1 of the meta-dnode, and intent log blocks (which are chained - * off the root block) have blkid == sequence number. In summary: + * is objset 0, and the meta-dnode is object 0. 
This covers all blocks + * except root blocks and ZIL blocks, which are defined as follows: * - * mos is objset 0 - * meta-dnode is object 0 - * root block is - * intent log is + * Root blocks (objset_phys_t) are object 0, level -1: . + * ZIL blocks are bookmarked . + * dmu_sync()ed ZIL data blocks are bookmarked . * - * Note: this structure is called a bookmark because its first purpose was - * to remember where to resume a pool-wide traverse. The absolute ordering - * for block visitation during traversal is defined in compare_bookmark(). + * Note: this structure is called a bookmark because its original purpose + * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel. * Therefore it must not change size or alignment between 32/64 bit @@ -232,14 +253,66 @@ typedef struct zbookmark { uint64_t zb_blkid; } zbookmark_t; +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define ZB_DESTROYED_OBJSET (-1ULL) + +#define ZB_ROOT_OBJECT (0ULL) +#define ZB_ROOT_LEVEL (-1LL) +#define ZB_ROOT_BLKID (0ULL) + +#define ZB_ZIL_OBJECT (0ULL) +#define ZB_ZIL_LEVEL (-2LL) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; - uint8_t zp_ndvas; + uint8_t zp_copies; + uint8_t zp_dedup; + uint8_t zp_dedup_verify; } zio_prop_t; +typedef struct zio_cksum_report zio_cksum_report_t; + +typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, + const void *good_data); +typedef void zio_cksum_free_f(void *cbdata, size_t size); + +struct zio_bad_cksum; /* defined in zio_checksum.h */ + +struct zio_cksum_report { + struct zio_cksum_report *zcr_next; + nvlist_t *zcr_ereport; + nvlist_t *zcr_detector; + void *zcr_cbdata; + size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_align; + uint64_t zcr_length; + zio_cksum_finish_f *zcr_finish; + zio_cksum_free_f *zcr_free; + + /* internal use only */ + struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ +}; + +typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, + void *arg); + +zio_vsd_cksum_report_f zio_vsd_default_cksum_report; + +typedef struct zio_vsd_ops { + zio_done_func_t *vsd_free; + zio_vsd_cksum_report_f *vsd_cksum_report; +} zio_vsd_ops_t; + typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; @@ -290,6 +363,7 @@ struct zio { uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; + blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; @@ -301,16 +375,20 @@ struct zio { zio_done_func_t *io_ready; zio_done_func_t *io_done; void *io_private; + int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* Data represented by this I/O */ void *io_data; + void *io_orig_data; uint64_t io_size; + uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; - zio_done_func_t *io_vsd_free; + const zio_vsd_ops_t *io_vsd_ops; + uint64_t io_offset; uint64_t io_deadline; avl_node_t io_offset_node; @@ -318,15 +396,17 @@ struct zio { avl_tree_t *io_vdev_tree; /* Internal pipeline state */ - int io_flags; - zio_stage_t io_stage; - uint32_t io_pipeline; - int io_orig_flags; - zio_stage_t io_orig_stage; - uint32_t io_orig_pipeline; + enum zio_flag io_flags; + enum zio_stage io_stage; + enum zio_stage io_pipeline; + enum zio_flag io_orig_flags; + enum 
zio_stage io_orig_stage; + enum zio_stage io_orig_pipeline; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -336,6 +416,7 @@ struct zio { kcondvar_t io_cv; /* FMA state */ + zio_cksum_report_t *io_cksum_report; uint64_t io_ena; #ifdef _KERNEL @@ -346,47 +427,53 @@ struct zio { }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, - zio_done_func_t *done, void *private, int flags); + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *private, int flags); + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_prop_t *zp, + void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); + int priority, enum zio_flag flags, zbookmark_t *zb); -extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); -extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t txg); -extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, enum zio_flag flags); + +extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t use_slog); +extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void 
zio_nowait(zio_t *zio); @@ -407,11 +494,11 @@ extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); @@ -420,8 +507,12 @@ extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); -extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); -extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); +extern enum zio_checksum zio_checksum_select(enum zio_checksum child, + enum zio_checksum parent); +extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, + enum zio_checksum child, enum zio_checksum parent); +extern enum zio_compress zio_compress_select(enum zio_compress child, + enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio); extern int zio_resume(spa_t *spa); @@ -443,9 +534,30 @@ extern int zio_inject_fault(char *name, int flags, int *id, extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); +extern void zio_handle_ignored_writes(zio_t *zio); + +/* + * Checksum ereport functions + */ +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, + uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical); + +extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); +extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); + +/* If we have the good data in hand, this function can be used */ +extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, struct zio_bad_cksum *info); + +/* Called from spa_sync(), but primarily an injection handler */ +extern void spa_handle_ignored_writes(spa_t *spa); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index da40739..0956c04 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_ZIO_CHECKSUM_H @@ -43,28 +42,31 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); typedef struct zio_checksum_info { zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ int ci_correctable; /* number of correctable bits */ - int ci_zbt; /* uses zio block tail? */ + int ci_eck; /* uses zio embedded checksum? */ + int ci_dedup; /* strong enough for dedup? */ char *ci_name; /* descriptive name */ } zio_checksum_info_t; +typedef struct zio_bad_cksum { + zio_cksum_t zbc_expected; + zio_cksum_t zbc_actual; + const char *zbc_checksum_name; + uint8_t zbc_byteswapped; + uint8_t zbc_injected; + uint8_t zbc_has_cksum; /* expected/actual valid */ +} zio_bad_cksum_t; + extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t fletcher_2_native; -extern zio_checksum_t fletcher_4_native; -extern zio_checksum_t fletcher_4_incremental_native; - -extern zio_checksum_t fletcher_2_byteswap; -extern zio_checksum_t fletcher_4_byteswap; -extern zio_checksum_t fletcher_4_incremental_byteswap; - extern zio_checksum_t zio_checksum_SHA256; extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); -extern int zio_checksum_error(zio_t *zio); +extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h index 66ee8d4..30bed1a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level); +extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. */ -extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, - void **destp, uint64_t *destsizep, uint64_t *destbufsizep); -extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize); +extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, + size_t s_len); +extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h index e7503b7..d90bd8b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -34,104 +34,136 @@ extern "C" { #endif /* - * I/O Groups: pipeline stage definitions. + * zio pipeline stage definitions */ -typedef enum zio_stage { - ZIO_STAGE_OPEN = 0, /* RWFCI */ +enum zio_stage { + ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ - ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ - ZIO_STAGE_READ_BP_INIT, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ - ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ - ZIO_STAGE_DVA_FREE, /* --F-- */ - ZIO_STAGE_DVA_CLAIM, /* ---C- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ - ZIO_STAGE_READY, /* RWFCI */ + ZIO_STAGE_READY = 1 << 15, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ - ZIO_STAGE_DONE, /* RWFCI */ - ZIO_STAGES -} zio_stage_t; + ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ +}; -#define ZIO_INTERLOCK_STAGES \ - ((1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_INTERLOCK_STAGES \ + (ZIO_STAGE_READY | \ + ZIO_STAGE_DONE) -#define ZIO_INTERLOCK_PIPELINE \ +#define ZIO_INTERLOCK_PIPELINE \ ZIO_INTERLOCK_STAGES -#define ZIO_VDEV_IO_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_VDEV_IO_STAGES \ + (ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_DONE | \ + ZIO_STAGE_VDEV_IO_ASSESS) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DONE) -#define ZIO_READ_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY)) +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_CHECKSUM_VERIFY) -#define ZIO_READ_PHYS_PIPELINE \ +#define ZIO_READ_PHYS_PIPELINE \ ZIO_READ_COMMON_STAGES -#define ZIO_READ_PIPELINE \ - (ZIO_READ_COMMON_STAGES | \ - (1U << ZIO_STAGE_READ_BP_INIT)) +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + ZIO_STAGE_READ_BP_INIT) -#define ZIO_WRITE_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_ISSUE_ASYNC) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE)) - -#define ZIO_WRITE_PHYS_PIPELINE \ - ZIO_WRITE_COMMON_STAGES - -#define ZIO_REWRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT)) - -#define ZIO_WRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE)) - -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ - (1U << ZIO_STAGE_GANG_ISSUE)) +#define 
ZIO_DDT_CHILD_READ_PIPELINE \ + ZIO_READ_COMMON_STAGES -#define ZIO_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_FREE)) +#define ZIO_DDT_READ_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_READ_BP_INIT | \ + ZIO_STAGE_DDT_READ_START | \ + ZIO_STAGE_DDT_READ_DONE) -#define ZIO_CLAIM_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_CHECKSUM_GENERATE) -#define ZIO_IOCTL_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES -#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE) | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_CHILD_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_CHECKSUM_GENERATE | \ + ZIO_STAGE_DDT_WRITE) + +#define ZIO_GANG_STAGES \ + (ZIO_STAGE_GANG_ASSEMBLE | \ + ZIO_STAGE_GANG_ISSUE) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_DVA_FREE) + +#define ZIO_DDT_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_DDT_FREE) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_DVA_CLAIM) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) + +#define ZIO_BLOCKING_STAGES \ + (ZIO_STAGE_DVA_ALLOCATE | \ + ZIO_STAGE_DVA_CLAIM | \ + ZIO_STAGE_VDEV_IO_START) extern void zio_inject_init(void); extern void zio_inject_fini(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h new file mode 100644 index 0000000..dcd63f7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_ZRLOCK_H +#define _SYS_ZRLOCK_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zrlock { + kmutex_t zr_mtx; + volatile int32_t zr_refcount; + kcondvar_t zr_cv; + uint16_t zr_pad; +#ifdef ZFS_DEBUG + kthread_t *zr_owner; + const char *zr_caller; +#endif +} zrlock_t; + +extern void zrl_init(zrlock_t *); +extern void zrl_destroy(zrlock_t *); +#ifdef ZFS_DEBUG +#define zrl_add(_z) zrl_add_debug((_z), __func__) +extern void zrl_add_debug(zrlock_t *, const char *); +#else +extern void zrl_add(zrlock_t *); +#endif +extern void zrl_remove(zrlock_t *); +extern int zrl_tryenter(zrlock_t *); +extern void zrl_exit(zrlock_t *); +extern int zrl_is_zero(zrlock_t *); +extern int zrl_is_locked(zrlock_t *); +#ifdef ZFS_DEBUG +extern kthread_t *zrl_owner(zrlock_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZRLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h index 2a6452a..c0a0a69 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h @@ -20,15 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZVOL_H #define _SYS_ZVOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -43,26 +40,41 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, major_t); +extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); +extern void zvol_remove_minors(const char *); extern int zvol_set_volsize(const char *, major_t, uint64_t); -extern int zvol_set_volblocksize(const char *, uint64_t); +#ifdef sun extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); -#ifndef __FreeBSD__ extern int zvol_strategy(buf_t *bp); extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); -#endif +#endif /* sun */ extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp); extern int zvol_busy(void); extern void zvol_init(void); extern void zvol_fini(void); + +#ifdef sun +extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize, + uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, + void **rl_hdl, void **bonus_hdl); +extern uint64_t zvol_get_volume_size(void *minor_hdl); +extern int zvol_get_volume_wce(void *minor_hdl); +extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, + ssize_t resid, boolean_t sync); +#endif /* sun */ + +#ifdef __FreeBSD__ +extern int zvol_create_minors(const char *name); +#endif + #endif #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index c69c117..0885f27 100644 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -19,14 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include +#include #include +#include #include /* @@ -36,24 +37,13 @@ static void txg_sync_thread(void *arg); static void txg_quiesce_thread(void *arg); -int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ -extern int zfs_txg_synctime; -extern uint64_t zfs_write_limit_override; +int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, - "ZFS transaction groups (TXG)"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); TUNABLE_INT("vfs.zfs.txg.timeout", &zfs_txg_timeout); SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RDTUN, &zfs_txg_timeout, 0, "Maximum seconds worth of delta per txg"); -TUNABLE_INT("vfs.zfs.txg.synctime", &zfs_txg_synctime); -SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime, CTLFLAG_RDTUN, &zfs_txg_synctime, - 0, "Target seconds to sync a txg"); -TUNABLE_QUAD("vfs.zfs.txg.write_limit_override", &zfs_write_limit_override); -SYSCTL_UQUAD(_vfs_zfs_txg, OID_AUTO, write_limit_override, CTLFLAG_RW, - &zfs_write_limit_override, 0, - "Override maximum size of a txg to this size in bytes, " - "value of 0 means don't override"); /* * Prepare the txg subsystem. @@ -74,10 +64,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg) for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + list_create(&tx->tx_cpu[c].tc_callbacks[i], + sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); } } - rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); @@ -100,7 +92,6 @@ txg_fini(dsl_pool_t *dp) ASSERT(tx->tx_threads == 0); - rw_destroy(&tx->tx_suspend); mutex_destroy(&tx->tx_sync_lock); cv_destroy(&tx->tx_sync_more_cv); @@ -113,10 +104,15 @@ txg_fini(dsl_pool_t *dp) int i; mutex_destroy(&tx->tx_cpu[c].tc_lock); - for (i = 0; i < TXG_SIZE; i++) + for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); + } } + if (tx->tx_commit_cb_taskq != NULL) + taskq_destroy(tx->tx_commit_cb_taskq); + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); @@ -196,7 +192,11 @@ txg_sync_stop(dsl_pool_t *dp) * Finish off any work in progress. */ ASSERT(tx->tx_threads == 2); - txg_wait_synced(dp, 0); + + /* + * We need to ensure that we've vacated the deferred space_maps. + */ + txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); /* * Wake all sync threads and wait for them to die. 
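Aside (not part of this patch): the per-CPU tc_callbacks lists created in txg_init() above feed the commit-callback machinery added in the following hunks, where txg_register_callbacks() collects callbacks per txg and a tx_commit_cb taskq fires them once that txg has reached stable storage. A minimal consumer-side sketch, assuming the dmu_tx_callback_register()/dmu_tx_callback_func_t interface that accompanies these lists in the DMU (names shown for illustration only, not taken from this diff):

	static void
	durable_note_cb(void *arg, int error)
	{
		/*
		 * Runs from the tx_commit_cb taskq after spa_sync() has
		 * finished the txg (error == 0), or with a non-zero error
		 * if the tx is aborted.  Only 'arg' may be touched here.
		 */
		kmem_free(arg, sizeof (uint64_t));
	}

	void
	durable_write(objset_t *os, uint64_t object, uint64_t off, int len)
	{
		dmu_tx_t *tx = dmu_tx_create(os);
		uint64_t *cookie;

		dmu_tx_hold_write(tx, object, off, len);
		if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
			dmu_tx_abort(tx);
			return;
		}
		cookie = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*cookie = dmu_tx_get_txg(tx);
		/* queued on this txg's per-CPU tc_callbacks list */
		dmu_tx_callback_register(tx, durable_note_cb, cookie);
		/* ... perform the dmu_write() covered by the hold ... */
		dmu_tx_commit(tx);
	}

The callback runs only after syncing context has pushed that txg to disk, so it must not reference the tx or any open-context state.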
@@ -246,6 +246,17 @@ txg_rele_to_quiesce(txg_handle_t *th) } void +txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + list_move_tail(&tc->tc_callbacks[g], tx_callbacks); + mutex_exit(&tc->tc_lock); +} + +void txg_rele_to_sync(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; @@ -296,9 +307,61 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) } static void +txg_do_callbacks(void *arg) +{ + list_t *cb_list = arg; + + dmu_tx_do_callbacks(cb_list, 0); + + list_destroy(cb_list); + + kmem_free(cb_list, sizeof (list_t)); +} + +/* + * Dispatch the commit callbacks registered on this txg to worker threads. + */ +static void +txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) +{ + int c; + tx_state_t *tx = &dp->dp_tx; + list_t *cb_list; + + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + /* No need to lock tx_cpu_t at this point */ + + int g = txg & TXG_MASK; + + if (list_is_empty(&tc->tc_callbacks[g])) + continue; + + if (tx->tx_commit_cb_taskq == NULL) { + /* + * Commit callback taskq hasn't been created yet. + */ + tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", + max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, + TASKQ_PREPOPULATE); + } + + cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(cb_list, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + + list_move_tail(&tc->tc_callbacks[g], cb_list); + + (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + txg_do_callbacks, cb_list, TQ_SLEEP); + } +} + +static void txg_sync_thread(void *arg) { dsl_pool_t *dp = arg; + spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; uint64_t start, delta; @@ -311,20 +374,19 @@ txg_sync_thread(void *arg) uint64_t txg; /* - * We sync when we're scrubbing, there's someone waiting + * We sync when we're scanning, there's someone waiting * on us, or the quiesce thread has handed off a txg to * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); - while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || - spa_shutting_down(dp->dp_spa)) && + while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); - delta = LBOLT - start; + delta = ddi_get_lbolt() - start; timer = (delta > timeout ? 0 : timeout - delta); } @@ -342,8 +404,6 @@ txg_sync_thread(void *arg) if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - rw_enter(&tx->tx_suspend, RW_WRITER); - /* * Consume the quiesced txg which has been handed off to * us. 
This may cause the quiescing thread to now be @@ -353,22 +413,24 @@ txg_sync_thread(void *arg) tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; cv_broadcast(&tx->tx_quiesce_more_cv); - rw_exit(&tx->tx_suspend); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); - start = LBOLT; - spa_sync(dp->dp_spa, txg); - delta = LBOLT - start; + start = ddi_get_lbolt(); + spa_sync(spa, txg); + delta = ddi_get_lbolt() - start; mutex_enter(&tx->tx_sync_lock); - rw_enter(&tx->tx_suspend, RW_WRITER); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; - rw_exit(&tx->tx_suspend); cv_broadcast(&tx->tx_sync_done_cv); + + /* + * Dispatch commit callbacks to worker threads. + */ + txg_dispatch_callbacks(dp, txg); } } @@ -426,7 +488,7 @@ void txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) { tx_state_t *tx = &dp->dp_tx; - int timeout = LBOLT + ticks; + int timeout = ddi_get_lbolt() + ticks; /* don't delay if this txg could transition to quiesing immediately */ if (tx->tx_open_txg > txg || @@ -439,10 +501,10 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) return; } - while (LBOLT < timeout && + while (ddi_get_lbolt() < timeout && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, - timeout - LBOLT); + timeout - ddi_get_lbolt()); mutex_exit(&tx->tx_sync_lock); } @@ -455,7 +517,7 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) - txg = tx->tx_open_txg; + txg = tx->tx_open_txg + TXG_DEFER_SIZE; if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", @@ -506,21 +568,6 @@ txg_sync_waiting(dsl_pool_t *dp) tx->tx_quiesced_txg != 0); } -void -txg_suspend(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - /* XXX some code paths suspend when they are already suspended! */ - rw_enter(&tx->tx_suspend, RW_READER); -} - -void -txg_resume(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - rw_exit(&tx->tx_suspend); -} - /* * Per-txg object lists. */ @@ -578,6 +625,34 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) } /* + * Add an entry to the end of the list (walks list to find end). + * Returns 0 if it's a new entry, 1 if it's already there. + */ +int +txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + int already_on_list; + + mutex_enter(&tl->tl_lock); + already_on_list = tn->tn_member[t]; + if (!already_on_list) { + txg_node_t **tp; + + for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) + continue; + + tn->tn_member[t] = 1; + tn->tn_next[t] = NULL; + *tp = tn; + } + mutex_exit(&tl->tl_lock); + + return (already_on_list); +} + +/* * Remove the head of the list and return it. */ void * diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c index 34d7e0c..692cda1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) ub->ub_txg = txg; ub->ub_guid_sum = rvd->vdev_guid_sum; ub->ub_timestamp = gethrestime_sec(); + ub->ub_software_version = SPA_VERSION; return (ub->ub_rootbp.blk_birth == txg); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index cb43af3..51a3c79 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -40,6 +39,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -61,6 +61,7 @@ static vdev_ops_t *vdev_ops_table[] = { #endif &vdev_file_ops, &vdev_missing_ops, + &vdev_hole_ops, NULL }; @@ -95,9 +96,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; - uint64_t c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } @@ -106,40 +106,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) } /* - * Get the replaceable or attachable device size. - * If the parent is a mirror or raidz, the replaceable size is the minimum - * psize of all its children. For the rest, just return our own psize. - * - * e.g. - * psize rsize - * root - - - * mirror/raidz - - - * disk1 20g 20g - * disk2 40g 20g - * disk3 80g 80g + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. */ uint64_t -vdev_get_rsize(vdev_t *vd) +vdev_get_min_asize(vdev_t *vd) { - vdev_t *pvd, *cvd; - uint64_t c, rsize; + vdev_t *pvd = vd->vdev_parent; + + /* + * The our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. + */ + if (pvd == NULL) + return (vd->vdev_asize); - pvd = vd->vdev_parent; + /* + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. + */ + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* - * If our parent is NULL or the root, just return our own psize. + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. 
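+ * For example: a 4-wide raidz built from 1 TB children has a minimum
+ * asize of roughly 4 TB, so any replacement or attached child must
+ * supply at least 4 TB / 4 = 1 TB (illustrative numbers only).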
*/ - if (pvd == NULL || pvd->vdev_parent == NULL) - return (vd->vdev_psize); + if (pvd->vdev_ops == &vdev_raidz_ops) + return (pvd->vdev_min_asize / pvd->vdev_children); - rsize = 0; + return (pvd->vdev_min_asize); +} - for (c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; - } +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); - return (rsize); + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * @@ -160,13 +167,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { - int c; vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); @@ -212,9 +218,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; } void @@ -249,9 +252,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; } /* @@ -262,17 +262,17 @@ vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; - int newc, c; + int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = newc = 0; c < oldc; c++) + for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - for (c = newc = 0; c < oldc; c++) { + for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; @@ -287,7 +287,7 @@ vdev_compact_children(vdev_t *pvd) /* * Allocate and minimally initialize a vdev_t. */ -static vdev_t * +vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; @@ -299,21 +299,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) spa->spa_root_vdev = vd; } - if (guid == 0) { + if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. 
*/ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } @@ -324,6 +321,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); @@ -384,6 +382,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -400,6 +401,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (ENOTSUP); + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (ENOTSUP); + /* * Set the nparity property for RAID-Z vdevs. */ @@ -407,23 +411,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { - /* - * Currently, we can only support 2 parity devices. - */ - if (nparity == 0 || nparity > 2) + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (EINVAL); /* - * Older versions can only support 1 parity device. + * Previous versions could only support 1 or 2 parity + * device. */ - if (nparity == 2 && - spa_version(spa) < SPA_VERSION_RAID6) + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (ENOTSUP); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) return (ENOTSUP); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= SPA_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (EINVAL); /* * Otherwise, we default to 1 parity device for RAID-Z. @@ -471,43 +476,86 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* + * Retrieve the vdev creation time. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + + /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); + } + + if (parent && !parent->vdev_parent) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + vd->vdev_mg = metaslab_group_create(islog ? + spa_log_class(spa) : spa_normal_class(spa), vd); } /* * If we're a leaf vdev, try to load the DTL object and other state. 
*/ if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, + &vd->vdev_resilvering); + /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be - * valid in the current context. + * valid in the current context. Local vdevs will + * remain in the faulted state. */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { + if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } } } @@ -524,7 +572,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, void vdev_free(vdev_t *vd) { - int c; spa_t *spa = vd->vdev_spa; /* @@ -534,11 +581,12 @@ vdev_free(vdev_t *vd) vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); @@ -547,8 +595,10 @@ vdev_free(vdev_t *vd) /* * Discard allocation state. 
*/ - if (vd == vd->vdev_top) + if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + } ASSERT3U(vd->vdev_stat.vs_space, ==, 0); ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); @@ -668,14 +718,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { - int c; - if (vd == NULL) return; vd->vdev_top = tvd; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } @@ -694,8 +742,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); @@ -737,6 +787,7 @@ vdev_remove_parent(vdev_t *cvd) */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } @@ -756,16 +807,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; - if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. + */ + if (vd->vdev_ms_shift == 0) return (0); + ASSERT(!vd->vdev_ishole); + /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the current "typical" blocksize. @@ -777,14 +834,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ASSERT(oldc <= newc); - if (vd->vdev_islog) - mc = spa->spa_log_class; - else - mc = spa->spa_normal_class; - - if (vd->vdev_mg == NULL) - vd->vdev_mg = metaslab_group_create(mc, vd); - mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { @@ -819,6 +868,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); } + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (oldc == 0 && !vd->vdev_removing) + metaslab_group_activate(vd->vdev_mg); + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (0); } @@ -829,6 +892,7 @@ vdev_metaslab_fini(vdev_t *vd) uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { + metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) if (vd->vdev_ms[m] != NULL) metaslab_fini(vd->vdev_ms[m]); @@ -956,6 +1020,10 @@ vdev_probe(vdev_t *vd, zio_t *zio) vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + /* + * We can't change the vdev state in this context, so we + * kick off an async task to do it on our behalf. 
+ */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); @@ -988,6 +1056,55 @@ vdev_probe(vdev_t *vd, zio_t *zio) return (NULL); } +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +boolean_t +vdev_uses_zvols(vdev_t *vd) +{ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (B_TRUE || vdev_uses_zvols(vd)) { + for (int c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + return; + } + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != 0); + + taskq_destroy(tq); +} + /* * Prepare a virtual device for access. */ @@ -996,13 +1113,12 @@ vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; - int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); @@ -1010,11 +1126,18 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); + vd->vdev_label_aux); return (ENXIO); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); @@ -1024,6 +1147,11 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. + */ + vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); @@ -1039,20 +1167,40 @@ vdev_open(vdev_t *vd) vd->vdev_removed = B_FALSE; + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. 
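A note for readers following the new vdev_open_children(): in this import the "if (B_TRUE || vdev_uses_zvols(vd))" test makes the serial path unconditional, so every child is opened by the one thread that already holds spa_namespace_lock and the taskq branch below it is effectively unreachable. The sketch that follows is only an illustration of the upstream fan-out that branch implements, using the taskq KPI names already visible in the hunk; the helper names are invented and it is not meant as a drop-in replacement.

static void
open_one_child(void *arg)
{
        vdev_t *vd = arg;

        vd->vdev_open_thread = curthread;
        vd->vdev_open_error = vdev_open(vd);
        vd->vdev_open_thread = NULL;
}

static void
open_children_parallel(vdev_t *vd)
{
        int children = vd->vdev_children;
        taskq_t *tq;

        tq = taskq_create("vdev_open", children, minclsyspri,
            children, children, TASKQ_PREPOPULATE);

        for (int c = 0; c < children; c++)
                VERIFY(taskq_dispatch(tq, open_one_child,
                    vd->vdev_child[c], TQ_SLEEP) != 0);

        /* taskq_destroy() waits for the dispatched opens to finish. */
        taskq_destroy(tq);
}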
+ */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (ENXIO); + } + if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { - vd->vdev_state = VDEV_STATE_HEALTHY; + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } - for (c = 0; c < vd->vdev_children; c++) + /* + * For hole or missing vdevs we just return success. + */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); @@ -1077,6 +1225,15 @@ vdev_open(vdev_t *vd) vd->vdev_psize = psize; + /* + * Make sure the allocatable size hasn't shrunk. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1093,25 +1250,18 @@ vdev_open(vdev_t *vd) VDEV_AUX_BAD_LABEL); return (EINVAL); } + } - /* - * Make sure the device hasn't shrunk. - */ - if (asize < vd->vdev_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. If automatic + * expansion is enabled then use the additional space. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) + vd->vdev_asize = asize; - /* - * If all children are healthy and the asize has increased, - * then we've experienced dynamic LUN growth. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - asize > vd->vdev_asize) { - vd->vdev_asize = asize; - } - } + vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the @@ -1119,8 +1269,8 @@ vdev_open(vdev_t *vd) */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_IO_FAILURE); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); return (error); } @@ -1150,12 +1300,11 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - int c; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (EBADF); @@ -1165,6 +1314,8 @@ vdev_validate(vdev_t *vd) * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + uint64_t aux_guid = 0; + nvlist_t *nvl; if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1172,6 +1323,18 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. 
+ */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + return (0); + } + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1180,6 +1343,11 @@ vdev_validate(vdev_t *vd) return (0); } + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's @@ -1187,12 +1355,16 @@ vdev_validate(vdev_t *vd) * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || - (vd->vdev_guid != guid && + ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1211,11 +1383,11 @@ vdev_validate(vdev_t *vd) nvlist_free(label); /* - * If spa->spa_load_verbatim is true, no need to check the + * If this is a verbatim import, no need to check the * state of the pool. */ - if (!spa->spa_load_verbatim && - spa->spa_load_state == SPA_LOAD_OPEN && + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && + spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1238,15 +1410,23 @@ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. + */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* - * We record the previous state before we close it, so that if we are + * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ @@ -1260,12 +1440,49 @@ vdev_close(vdev_t *vd) } void +vdev_hold(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); +} + +void +vdev_rele(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); +} + +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. 
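The aux_guid handling above makes the label-guid test in vdev_validate() hard to read in diff form. Restated positively (a sketch only, assuming the nvlist lookups above populated guid, top_guid and aux_guid, with aux_guid left at 0 when no ZPOOL_CONFIG_ORIG_GUID is present), a label is accepted when any of three identities match:

boolean_t label_ok =
    (vd->vdev_guid == guid) ||                          /* normal case */
    (aux_guid != 0 && vd->vdev_guid == aux_guid) ||     /* vdev mid-split */
    (vd->vdev_guid == top_guid && vd == vd->vdev_top);  /* detached sibling */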
+ */ +void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); @@ -1278,12 +1495,8 @@ vdev_reopen(vdev_t *vd) (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && - !l2arc_vdev_present(vd)) { - uint64_t size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + !l2arc_vdev_present(vd)) + l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } @@ -1323,33 +1536,23 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) return (0); } -/* - * The is the latter half of vdev_create(). It is distinct because it - * involves initiating transactions in order to do metaslab creation. - * For creation, we want to try to create all vdevs at once and then undo it - * if anything fails; this is much harder if we have pending transactions. - */ void -vdev_init(vdev_t *vd, uint64_t txg) +vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly 200 metaslabs per vdev. */ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); - - /* - * Initialize the vdev's metaslabs. This can't fail because - * there's nothing to read when creating all new metaslabs. - */ - VERIFY(vdev_metaslab_init(vd, txg) == 0); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); + ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); + ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); @@ -1364,7 +1567,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which - * the vdev has less than perfect replication. There are three kinds of DTL: + * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * @@ -1405,6 +1608,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); + ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) @@ -1458,14 +1662,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); - if (vd == spa->spa_root_vdev) + if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && - (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { - /* XXX should check scrub_done? */ + (spa->spa_scrub_started || + (scn && scn->scn_phys.scn_errors == 0))) { /* * We completed a scrub up to scrub_txg. 
If we * did it without rebooting, then the scrub dtl @@ -1550,6 +1756,8 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); + ASSERT(!vd->vdev_ishole); + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); @@ -1577,6 +1785,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_buf_t *db; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1655,6 +1865,9 @@ vdev_dtl_required(vdev_t *vd) vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + if (!required && zio_injection_enabled) + required = !!zio_handle_device_injection(vd, NULL, ECHILD); + return (required); } @@ -1713,7 +1926,7 @@ vdev_load(vdev_t *vd) /* * If this is a top-level vdev, initialize its metaslabs. */ - if (vd == vd->vdev_top && + if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1770,11 +1983,49 @@ vdev_validate_aux(vdev_t *vd) } void +vdev_remove(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (vd->vdev_dtl_smo.smo_object) { + ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); + vd->vdev_dtl_smo.smo_object = 0; + } + + if (vd->vdev_ms != NULL) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_smo.smo_object == 0) + continue; + + ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); + msp->ms_smo.smo_object = 0; + } + } + + if (vd->vdev_ms_array) { + (void) dmu_object_free(mos, vd->vdev_ms_array, tx); + vd->vdev_ms_array = 0; + vd->vdev_ms_shift = 0; + } + dmu_tx_commit(tx); +} + +void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + ASSERT(!vd->vdev_ishole); + while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); @@ -1790,6 +2041,8 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1800,6 +2053,12 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } + /* + * Remove the metadata associated with this vdev once it's empty. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) + vdev_remove(vd, txg); + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -1822,11 +2081,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) * not be opened, and no I/O is attempted. 
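The old vdev_init() is now split: only the sizing heuristic lives in vdev_metaslab_set_size(), while metaslab creation happens separately, but the "roughly 200 metaslabs per vdev" math is unchanged. A standalone worked example of that math (user-land sketch, not part of the patch; the local highbit() mimics the kernel helper, i.e. position of the highest set bit counting from 1, and the 2 TiB size is hypothetical):

#include <stdint.h>
#include <stdio.h>

#define SPA_MAXBLOCKSHIFT       17      /* 128K, as in spa.h */

static int
highbit(uint64_t i)
{
        int h = 0;

        while (i != 0) {
                h++;
                i >>= 1;
        }
        return (h);
}

int
main(void)
{
        uint64_t asize = 2ULL << 40;            /* hypothetical 2 TiB vdev */
        int ms_shift = highbit(asize / 200);    /* aim for ~200 metaslabs */

        if (ms_shift < SPA_MAXBLOCKSHIFT)
                ms_shift = SPA_MAXBLOCKSHIFT;

        /* 2 TiB / 200 is about 10.7 GiB; the highest set bit is 2^33 */
        printf("ms_shift=%d metaslabs=%ju\n",
            ms_shift, (uintmax_t)(asize >> ms_shift));  /* prints 34 and 128 */
        return (0);
}

So a 2 TiB top-level vdev ends up with 128 metaslabs of 16 GiB each; the MAX() clamp only matters for very small vdevs, where it keeps a metaslab at least one maximum-sized block (128K).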
*/ int -vdev_fault(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { - vdev_t *vd; + vdev_t *vd, *tvd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1834,19 +2093,28 @@ vdev_fault(spa_t *spa, uint64_t guid) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + /* * Faulted state takes precedence over degraded. */ + vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* - * If marking the vdev as faulted cause the top-level vdev to become - * unavailable, then back off and simply mark the vdev as degraded - * instead. + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. */ - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -1854,12 +2122,10 @@ vdev_fault(spa_t *spa, uint64_t guid) * If we reopen the device and it's not dead, only then do we * mark it degraded. */ - vdev_reopen(vd); + vdev_reopen(tvd); - if (vdev_readable(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); @@ -1871,11 +2137,11 @@ vdev_fault(spa_t *spa, uint64_t guid) * as I/O is concerned. 
*/ int -vdev_degrade(spa_t *spa, uint64_t guid) +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1892,7 +2158,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); + aux); return (spa_vdev_state_exit(spa, vd, 0)); } @@ -1906,9 +2172,9 @@ vdev_degrade(spa_t *spa, uint64_t guid) int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *vd; + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1916,13 +2182,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - vdev_reopen(vd->vdev_top); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + } + + vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && @@ -1931,16 +2210,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { + + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } return (spa_vdev_state_exit(spa, vd, 0)); } -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; - int error; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; - spa_vdev_state_enter(spa); +top: + spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1949,6 +2238,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. @@ -1964,6 +2255,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. 
+ */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_offline_log(spa); + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + } + + /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level @@ -1978,28 +2300,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. + */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - if (!tvd->vdev_islog || !vdev_is_dead(tvd)) - return (spa_vdev_state_exit(spa, vd, 0)); + return (spa_vdev_state_exit(spa, vd, 0)); +} - (void) spa_vdev_state_exit(spa, vd, 0); +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); - if (error) { - (void) vdev_online(spa, guid, 0, NULL); - return (error); - } - /* - * If we successfully offlined the log device then we need to - * sync out the current txg so that the "stubby" block can be - * removed by zil_sync(). - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - return (0); + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); } /* @@ -2033,13 +2357,22 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { - vd->vdev_faulted = vd->vdev_degraded = 0; + /* + * When reopening in reponse to a clear event, it may be due to + * a fmadm repair request. In this case, if the device is + * still broken, we want to still post the ereport again. + */ + vd->vdev_forcefault = B_TRUE; + + vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; - vdev_reopen(vd); + vdev_reopen(vd == rvd ? rvd : vd->vdev_top); - if (vd != rvd) + vd->vdev_forcefault = B_FALSE; + + if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) @@ -2047,12 +2380,30 @@ vdev_clear(spa_t *spa, vdev_t *vd) spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } + + /* + * When clearing a FMA-diagnosed fault, we always want to + * unspare the device, as we assume that the original spare was + * done in response to the FMA fault. + */ + if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state < VDEV_STATE_DEGRADED); + /* + * Holes and missing devices are always considered "dead". + * This simplifies the code since we don't have to check for + * these types of devices in the various code paths. + * Instead we rely on the fact that we skip over dead devices + * before issuing I/O to them. 
+ */ + return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + vd->vdev_ops == &vdev_missing_ops); } boolean_t @@ -2081,7 +2432,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write); + !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t @@ -2111,10 +2462,11 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_rsize(vd); + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; mutex_exit(&vd->vdev_stat_lock); /* @@ -2131,7 +2483,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_scrub_examined += cvs->vs_scrub_examined; + cvs->vs_scan_removing = cvd->vdev_removing; mutex_exit(&vd->vdev_stat_lock); } } @@ -2148,6 +2500,19 @@ vdev_clear_stats(vdev_t *vd) } void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; @@ -2191,8 +2556,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += psize; + if (flags & ZIO_FLAG_SCAN_THREAD) { + dsl_scan_phys_t *scn_phys = + &spa->spa_dsl_pool->dp_scan->scn_phys; + uint64_t *processed = &scn_phys->scn_processed; + + /* XXX cleanup? */ + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -2217,6 +2591,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize) !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + /* + * Intent logs writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) @@ -2230,14 +2612,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCRUB_THREAD))) { + (flags & ZIO_FLAG_SCAN_THREAD) || + spa->spa_claiming)) { /* - * This is either a normal write (not a repair), or it's a - * repair induced by the scrub thread. In the normal case, - * we commit the DTL change in the same txg as the block - * was born. In the scrub-induced repair case, we know that - * scrubs run in first-pass syncing context, so we commit - * the DTL change in spa->spa_syncing_txg. + * This is either a normal write (not a repair), or it's + * a repair induced by the scrub thread, or it's a repair + * made by zil_claim() during spa_load() in the first txg. + * In the normal case, we commit the DTL change in the same + * txg as the block was born. 
In the scrub-induced repair + * case, we know that scrubs run in first-pass syncing context, + * so we commit the DTL change in spa_syncing_txg(spa). + * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) @@ -2246,13 +2631,16 @@ vdev_stat_update(zio_t *zio, uint64_t psize) */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); - commit_txg = spa->spa_syncing_txg; + commit_txg = spa_syncing_txg(spa); + } else if (spa->spa_claiming) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + commit_txg = spa_first_txg(spa); } - ASSERT(commit_txg >= spa->spa_syncing_txg); + ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) @@ -2264,46 +2652,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - int c; - vdev_stat_t *vs = &vd->vdev_stat; - - for (c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - /* - * Update the in-core space usage stats for this vdev and the root vdev. + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, - boolean_t update_root) +vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -2319,28 +2680,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (update_root) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - /* - * Don't count non-normal (e.g. intent log) space as part of - * the pool's capacity. 
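The commit-txg selection above now has three cases instead of two. Restated as an illustrative helper (this function does not exist in the patch; the real code keeps the logic inline in vdev_stat_update()):

static uint64_t
dtl_commit_txg(spa_t *spa, int flags, uint64_t birth_txg)
{
        if (flags & ZIO_FLAG_SCAN_THREAD)
                return (spa_syncing_txg(spa));  /* scrub/resilver repair */
        if (spa->spa_claiming)
                return (spa_first_txg(spa));    /* zil_claim() during load */
        return (birth_txg);                     /* ordinary write */
}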
- */ - if (vd->vdev_mg->mg_class != spa->spa_normal_class) - return; - + if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } + + if (mc != NULL) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, + alloc_delta, defer_delta, space_delta, dspace_delta); + } } /* @@ -2355,6 +2714,8 @@ vdev_config_dirty(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_writeable(spa)); + /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. @@ -2392,7 +2753,7 @@ vdev_config_dirty(vdev_t *vd) * sketchy, but it will work. */ nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } @@ -2413,7 +2774,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_config_dirty_node)) + if (!list_link_active(&vd->vdev_config_dirty_node) && + !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -2442,6 +2804,7 @@ vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* @@ -2454,7 +2817,7 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node)) + if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -2481,13 +2844,18 @@ vdev_propagate_state(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; - int c; vdev_t *child; if (vd->vdev_children > 0) { - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; + /* + * Don't factor holes into the decision. + */ + if (child->vdev_ishole) + continue; + if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* @@ -2551,15 +2919,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) /* * If we are setting the vdev state to anything but an open state, then - * always close the underlying device. Otherwise, we keep accessible - * but invalid devices open forever. We don't call vdev_close() itself, - * because that implies some extra checks (offline, etc) that we don't - * want here. This is limited to leaf devices, because otherwise - * closing the device will affect other children. + * always close the underlying device unless the device has requested + * a delayed close (i.e. we're about to remove or fault the device). + * Otherwise, we keep accessible but invalid devices open forever. + * We don't call vdev_close() itself, because that implies some extra + * checks (offline, etc) that we don't want here. This is limited to + * leaf devices, because otherwise closing the device will affect other + * children. */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_delayed_close && vdev_is_dead(vd) && + vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); + /* + * If we have brought this vdev back into service, we need + * to notify fmd so that it can gracefully repair any outstanding + * cases due to a missing device. 
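One porting hazard in the vdev_space_update() change: besides feeding the new metaslab-class accounting, the parameter order changed, so an old-style call still compiles (boolean_t converts silently to int64_t) but misassigns every delta. The two prototypes, derived from the definitions in this hunk, side by side:

/* before */
void vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
    boolean_t update_root);

/* after */
void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta);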
We do this in all cases, even those + * that probably don't correlate to a repaired fault. This is sure to + * catch all cases, and we let the zfs-retire agent sort it out. If + * this is a transient state it's OK, as the retire agent will + * double-check the state of the vdev before repairing it. + */ + if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && + vd->vdev_prevstate != state) + zfs_post_state_change(spa, vd); + if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -2575,20 +2959,16 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { - /* - * Indicate to the ZFS DE that this device has been removed, and - * any recent errors should be ignored. - */ - zfs_post_remove(spa, vd); vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* - * If we fail to open a vdev during an import, we mark it as - * "not available", which signifies that it was never there to - * begin with. Failure to open such a device is not considered - * an error. + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. */ - if (spa->spa_load_state == SPA_LOAD_IMPORT && + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2631,9 +3011,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; - case VDEV_AUX_IO_FAILURE: - class = FM_EREPORT_ZFS_IO_FAILURE; - break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } @@ -2682,7 +3059,7 @@ vdev_is_bootable(vdev_t *vd) return (B_FALSE); } - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } @@ -2690,31 +3067,84 @@ vdev_is_bootable(vdev_t *vd) return (B_TRUE); } +/* + * Load the state from the original vdev tree (ovd) which + * we've retrieved from the MOS config object. If the original + * vdev was offline or faulted then we transfer that state to the + * device in the current vdev tree (nvd). + */ void -vdev_load_log_state(vdev_t *vd, nvlist_t *nv) +vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { - uint_t c, children; - nvlist_t **child; - uint64_t val; - spa_t *spa = vd->vdev_spa; + spa_t *spa = nvd->vdev_spa; - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - vdev_load_log_state(vd->vdev_child[c], child[c]); - } + ASSERT(nvd->vdev_top->vdev_islog); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); - if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { + for (int c = 0; c < nvd->vdev_children; c++) + vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); + if (nvd->vdev_ops->vdev_op_leaf) { /* - * It would be nice to call vdev_offline() - * directly but the pool isn't fully loaded and - * the txg threads have not been started yet. 
+ * Restore the persistent vdev state */ - spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); - vd->vdev_offline = val; - vdev_reopen(vd->vdev_top); - spa_config_exit(spa, SCL_STATE_ALL, FTAG); + nvd->vdev_offline = ovd->vdev_offline; + nvd->vdev_faulted = ovd->vdev_faulted; + nvd->vdev_degraded = ovd->vdev_degraded; + nvd->vdev_removed = ovd->vdev_removed; + } +} + +/* + * Determine if a log device has valid content. If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. + */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Expand a vdev if possible. + */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +} + +/* + * Split a vdev. + */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; } + vdev_propagate_state(cvd); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c index 8fc3738..7978d61 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -184,7 +184,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; - ve->ve_lastused = LBOLT; + ve->ve_lastused = ddi_get_lbolt(); ve->ve_data = zio_buf_alloc(VCBS); avl_add(&vc->vc_offset_tree, ve); @@ -201,9 +201,9 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) ASSERT(MUTEX_HELD(&vc->vc_lock)); ASSERT(ve->ve_fill_io == NULL); - if (ve->ve_lastused != LBOLT) { + if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = LBOLT; + ve->ve_lastused = ddi_get_lbolt(); avl_add(&vc->vc_lastused_tree, ve); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 5db7a6a..d741773 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include -#include +#include #include #include #include @@ -44,12 +43,71 @@ typedef struct vdev_disk_buf { zio_t *vdb_io; } vdev_disk_buf_t; +static void +vdev_disk_hold(vdev_t *vd) +{ + ddi_devid_t devid; + char *minor; + + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. 
+ */ + if (vd->vdev_tsd != NULL) + return; + + if (vd->vdev_wholedisk == -1ULL) { + size_t len = strlen(vd->vdev_path) + 3; + char *buf = kmem_alloc(len, KM_SLEEP); + + (void) snprintf(buf, len, "%ss0", vd->vdev_path); + + (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); + kmem_free(buf, len); + } + + if (vd->vdev_name_vp == NULL) + (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); + + if (vd->vdev_devid != NULL && + ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { + (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); + ddi_devid_str_free(minor); + ddi_devid_free(devid); + } +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + if (vd->vdev_name_vp) { + VN_RELE_ASYNC(vd->vdev_name_vp, + dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); + vd->vdev_name_vp = NULL; + } + if (vd->vdev_devid_vp) { + VN_RELE_ASYNC(vd->vdev_devid_vp, + dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); + vd->vdev_devid_vp = NULL; + } +} + static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; - struct dk_minfo dkm; + struct dk_minfo_ext dkmext; int error; dev_t dev; int otyp; @@ -62,6 +120,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (EINVAL); } + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dvd = vd->vdev_tsd; + goto skip_open; + } + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); /* @@ -79,12 +147,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. - * - * If the vdev is part of the root pool, we avoid opening it by path. - * We do this because there is no /dev path available early in boot, - * and if we try to open the device by path at a later point, we can - * deadlock when devfsadm attempts to open the underlying backing store - * file. */ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, @@ -96,7 +158,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = EINVAL; /* presume failure */ - if (vd->vdev_path != NULL && !spa_is_root(spa)) { + if (vd->vdev_path != NULL) { ddi_devid_t devid; if (vd->vdev_wholedisk == -1ULL) { @@ -167,7 +229,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ - if (error && vd->vdev_path != NULL && !spa_is_root(spa)) + if (error && vd->vdev_path != NULL) error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } @@ -202,6 +264,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) kmem_free(physpath, MAXPATHLEN); } +skip_open: /* * Determine the actual size of the device. */ @@ -224,11 +287,11 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. 
*/ - if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm, + if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext, FKIOCTL, kcred, NULL) != 0) - dkm.dki_lbsize = DEV_BSIZE; + dkmext.dki_pbsize = DEV_BSIZE; - *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1; + *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1; /* * Clear the nowritecache bit, so that on a vdev_reopen() we will @@ -244,7 +307,7 @@ vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; - if (dvd == NULL) + if (vd->vdev_reopening || dvd == NULL) return; if (dvd->vd_minor != NULL) @@ -256,6 +319,7 @@ vdev_disk_close(vdev_t *vd) if (dvd->vd_lh != NULL) (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); + vd->vdev_delayed_close = B_FALSE; kmem_free(dvd, sizeof (vdev_disk_t)); vd->vdev_tsd = NULL; } @@ -315,6 +379,11 @@ vdev_disk_ioctl_free(zio_t *zio) kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } +static const zio_vsd_ops_t vdev_disk_vsd_ops = { + vdev_disk_ioctl_free, + zio_vsd_default_cksum_report +}; + static void vdev_disk_ioctl_done(void *zio_arg, int error) { @@ -355,7 +424,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); - zio->io_vsd_free = vdev_disk_ioctl_free; + zio->io_vsd_ops = &vdev_disk_vsd_ops; dkc->dkc_callback = vdev_disk_ioctl_done; dkc->dkc_flag = FLUSH_VOLATILE; @@ -427,14 +496,23 @@ vdev_disk_io_done(zio_t *zio) * asynchronous removal of the device. Otherwise, probe the device and * make sure it's still accessible. */ - if (zio->io_error == EIO) { + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. + */ + zfs_post_remove(zio->io_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; } } } @@ -446,6 +524,8 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_io_start, vdev_disk_io_done, NULL, + vdev_disk_hold, + vdev_disk_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -488,6 +568,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { uint64_t offset, state, txg = 0; @@ -522,6 +603,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) kmem_free(label, sizeof (vdev_label_t)); (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = EIDRM; return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index 67bd110..be3cefc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -35,6 +34,18 @@ * Virtual device vector for files. 
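Switching from DKIOCGMEDIAINFO/dki_lbsize to DKIOCGMEDIAINFOEXT/dki_pbsize means ashift now tracks the physical rather than the logical sector size. A standalone illustration of the derivation (not part of the patch; the 4096-byte size is a hypothetical "advanced format" disk and highbit() is re-implemented the same way as in the earlier sketch):

#include <stdint.h>
#include <stdio.h>

#define SPA_MINBLOCKSIZE        512     /* as in spa.h */

static int
highbit(uint64_t i)
{
        int h = 0;

        while (i != 0) {
                h++;
                i >>= 1;
        }
        return (h);
}

int
main(void)
{
        uint64_t pbsize = 4096;         /* hypothetical 4K-sector disk */

        if (pbsize < SPA_MINBLOCKSIZE)
                pbsize = SPA_MINBLOCKSIZE;

        /* highbit(4096) == 13, so ashift == 12 (4K-aligned allocations) */
        printf("ashift=%d\n", highbit(pbsize) - 1);
        return (0);
}

A drive that still reports a 512-byte physical sector keeps ashift 9, so existing pools are unaffected; only disks advertising a larger physical sector get the bigger alignment.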
*/ +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { @@ -51,6 +62,17 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (EINVAL); } + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + vp = vf->vf_vnode; + goto skip_open; + } + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* @@ -65,6 +87,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; return (error); } @@ -77,9 +101,13 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) if (vp->v_type != VREG) { (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; return (ENODEV); } #endif + +skip_open: /* * Determine the physical size of the file. */ @@ -92,6 +120,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) if (error) { (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; return (error); } @@ -106,12 +136,15 @@ vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; - if (vf == NULL) + if (vd->vdev_reopening || vf == NULL) return; - if (vf->vf_vnode != NULL) + if (vf->vf_vnode != NULL) { (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); + } + + vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } @@ -168,6 +201,8 @@ vdev_ops_t vdev_file_ops = { vdev_file_io_start, vdev_file_io_done, NULL, + vdev_file_hold, + vdev_file_rele, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -184,6 +219,8 @@ vdev_ops_t vdev_disk_ops = { vdev_file_io_start, vdev_file_io_done, NULL, + vdev_file_hold, + vdev_file_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index fa42871..4d4b63c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -47,31 +47,39 @@ struct g_class zfs_vdev_class = { DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); +/* + * Don't send BIO_FLUSH. + */ +static int vdev_geom_bio_flush_disable = 0; +TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable); +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); + static void vdev_geom_orphan(struct g_consumer *cp) { - struct g_geom *gp; vdev_t *vd; - int error; g_topology_assert(); vd = cp->private; - gp = cp->geom; - error = cp->provider->error; - ZFS_LOG(1, "Closing access to %s.", cp->provider->name); - if (cp->acr + cp->acw + cp->ace > 0) - g_access(cp, -cp->acr, -cp->acw, -cp->ace); - ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); - g_detach(cp); - g_destroy_consumer(cp); - /* Destroy geom if there are no consumers left. 
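All three backends touched by this patch (disk, file and, further down, geom) add the same pair of entries to their vdev_ops_t initializers, because the ops vector now carries vdev_op_hold/vdev_op_rele slots. Any out-of-tree backend needs the same two entries in the same position; a sketch of the resulting layout (field order inferred from these initializers, the vdev_example_* callback names are invented):

vdev_ops_t vdev_example_ops = {
        vdev_example_open,
        vdev_example_close,
        vdev_default_asize,
        vdev_example_io_start,
        vdev_example_io_done,
        NULL,                   /* vdev_op_state_change */
        vdev_example_hold,      /* new slot */
        vdev_example_rele,      /* new slot */
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};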
*/ - if (LIST_EMPTY(&gp->consumer)) { - ZFS_LOG(1, "Destroyed geom %s.", gp->name); - g_wither_geom(gp, error); - } - vd->vdev_tsd = NULL; + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + zfs_post_remove(vd->vdev_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } @@ -223,16 +231,12 @@ vdev_geom_read_guid(struct g_consumer *cp) uint64_t psize; off_t offset, size; uint64_t guid; - int error, l, len, iszvol; + int error, l, len; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading guid from %s...", pp->name); - if (g_getattr("ZFS::iszvol", cp, &iszvol) == 0 && iszvol) { - ZFS_LOG(1, "Skipping ZVOL-based provider %s.", pp->name); - return (0); - } psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); @@ -270,11 +274,6 @@ vdev_geom_read_guid(struct g_consumer *cp) return (guid); } -struct vdev_geom_find { - uint64_t guid; - struct g_consumer *cp; -}; - static void vdev_geom_taste_orphan(struct g_consumer *cp) { @@ -283,25 +282,23 @@ vdev_geom_taste_orphan(struct g_consumer *cp) cp->provider->name)); } -static void -vdev_geom_attach_by_guid_event(void *arg, int flags __unused) +static struct g_consumer * +vdev_geom_attach_by_guid(uint64_t guid) { - struct vdev_geom_find *ap; struct g_class *mp; struct g_geom *gp, *zgp; struct g_provider *pp; - struct g_consumer *zcp; - uint64_t guid; + struct g_consumer *cp, *zcp; + uint64_t pguid; g_topology_assert(); - ap = arg; - zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); /* This orphan function should be never called. 
*/ zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); + cp = NULL; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; @@ -317,39 +314,29 @@ vdev_geom_attach_by_guid_event(void *arg, int flags __unused) continue; } g_topology_unlock(); - guid = vdev_geom_read_guid(zcp); + pguid = vdev_geom_read_guid(zcp); g_topology_lock(); g_access(zcp, -1, 0, 0); g_detach(zcp); - if (guid != ap->guid) + if (pguid != guid) continue; - ap->cp = vdev_geom_attach(pp); - if (ap->cp == NULL) { + cp = vdev_geom_attach(pp); + if (cp == NULL) { printf("ZFS WARNING: Unable to attach to %s.\n", pp->name); continue; } - goto end; + break; } + if (cp != NULL) + break; } + if (cp != NULL) + break; } - ap->cp = NULL; end: g_destroy_consumer(zcp); g_destroy_geom(zgp); -} - -static struct g_consumer * -vdev_geom_attach_by_guid(uint64_t guid) -{ - struct vdev_geom_find *ap; - struct g_consumer *cp; - - ap = kmem_zalloc(sizeof(*ap), KM_SLEEP); - ap->guid = guid; - g_waitfor_event(vdev_geom_attach_by_guid_event, ap, M_WAITOK, NULL); - cp = ap->cp; - kmem_free(ap, sizeof(*ap)); return (cp); } @@ -360,6 +347,8 @@ vdev_geom_open_by_guid(vdev_t *vd) char *buf; size_t len; + g_topology_assert(); + ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guid(vd->vdev_guid); if (cp != NULL) { @@ -387,8 +376,9 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid) struct g_consumer *cp; uint64_t guid; + g_topology_assert(); + cp = NULL; - g_topology_lock(); pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); @@ -410,7 +400,6 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid) } } } - g_topology_unlock(); return (cp); } @@ -420,7 +409,8 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { struct g_provider *pp; struct g_consumer *cp; - int error, owned; + size_t bufsize; + int error, lock; /* * We must have a pathname, and it must be absolute. @@ -432,15 +422,22 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vd->vdev_tsd = NULL; - if ((owned = mtx_owned(&Giant))) - mtx_unlock(&Giant); + if (mutex_owned(&spa_namespace_lock)) { + mutex_exit(&spa_namespace_lock); + lock = 1; + } else { + lock = 0; + } + DROP_GIANT(); + g_topology_lock(); error = 0; /* - * If we're creating pool, just find GEOM provider by its name - * and ignore GUID mismatches. + * If we're creating or splitting a pool, just find the GEOM provider + * by its name and ignore GUID mismatches. 
*/ - if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE) + if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_splitting_newspa == B_TRUE) cp = vdev_geom_open_by_path(vd, 0); else { cp = vdev_geom_open_by_path(vd, 1); @@ -472,7 +469,6 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) { int i; - g_topology_lock(); for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) @@ -487,10 +483,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_geom_detach(cp, 0); cp = NULL; } - g_topology_unlock(); } - if (owned) - mtx_lock(&Giant); + g_topology_unlock(); + PICKUP_GIANT(); + if (lock) + mutex_enter(&spa_namespace_lock); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); @@ -516,6 +513,12 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ vd->vdev_nowritecache = B_FALSE; + if (vd->vdev_physpath != NULL) + spa_strfree(vd->vdev_physpath); + bufsize = sizeof("/dev/") + strlen(pp->name); + vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP); + snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name); + return (0); } @@ -528,30 +531,50 @@ vdev_geom_close(vdev_t *vd) if (cp == NULL) return; vd->vdev_tsd = NULL; + vd->vdev_delayed_close = B_FALSE; g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL); } static void vdev_geom_io_intr(struct bio *bp) { + vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; + vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = EIO; if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) { - vdev_t *vd; - /* * If we get ENOTSUP, we know that no future * attempts will ever succeed. In this case we * set a persistent bit so that we don't bother * with the ioctl in the future. */ - vd = zio->io_vd; vd->vdev_nowritecache = B_TRUE; } + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. + */ + /* XXX: zfs_post_remove() can sleep. */ + zfs_post_remove(zio->io_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } g_destroy_bio(bp); zio_interrupt(zio); } @@ -577,7 +600,7 @@ vdev_geom_io_start(zio_t *zio) case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush) + if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; if (vd->vdev_nowritecache) { @@ -628,6 +651,16 @@ vdev_geom_io_done(zio_t *zio) { } +static void +vdev_geom_hold(vdev_t *vd) +{ +} + +static void +vdev_geom_rele(vdev_t *vd) +{ +} + vdev_ops_t vdev_geom_ops = { vdev_geom_open, vdev_geom_close, @@ -635,6 +668,8 @@ vdev_ops_t vdev_geom_ops = { vdev_geom_io_start, vdev_geom_io_done, NULL, + vdev_geom_hold, + vdev_geom_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 48d5fc2..c08ed8b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -141,6 +140,7 @@ #include #include #include +#include #include /* @@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare, boolean_t isl2cache) + vdev_config_flag_t flags) { nvlist_t *nv = NULL; @@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); - if (!isspare && !isl2cache) + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -246,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * into a crufty old storage pool. */ ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity == 2 && - spa_version(spa) >= SPA_VERSION_RAID6)); + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add the nparity tag even on storage pools @@ -268,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - if (!isspare && !isl2cache && vd == vd->vdev_top) { + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -279,42 +282,80 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_asize) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog) == 0); + if (vd->vdev_removing) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, + vd->vdev_removing) == 0); } if (vd->vdev_dtl_smo.smo_object != 0) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, vd->vdev_dtl_smo.smo_object) == 0); + if (vd->vdev_crtxg) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + vd->vdev_crtxg) == 0); + if (getstats) { vdev_stat_t vs; + pool_scan_stat_t ps; + vdev_get_stats(vd, &vs); - VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, + VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + + /* provide either current or previous scan information */ + if (spa_scan_get_stats(spa, &ps) == 0) { + VERIFY(nvlist_add_uint64_array(nv, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)) + == 0); + } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c; + int c, idx; + + ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0; c < vd->vdev_children; c++) - child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare, isl2cache); + for (c = 0, idx = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + /* + * If we're generating an nvlist of removing + * vdevs then skip over any device which is + * not being removed. 
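vdev_config_generate() now takes a vdev_config_flag_t bitmask in place of the separate isspare/isl2cache booleans, which lets the same parameter also carry the new removing-vdev case. A tiny sketch of the equivalence, with stand-in flag names and values invented for illustration:

#include <assert.h>

typedef enum {
        CFG_SPARE       = 1 << 0,       /* stand-in for VDEV_CONFIG_SPARE    */
        CFG_L2CACHE     = 1 << 1,       /* stand-in for VDEV_CONFIG_L2CACHE  */
        CFG_REMOVING    = 1 << 2        /* stand-in for VDEV_CONFIG_REMOVING */
} cfg_flag_t;

int
main(void)
{
        int isspare, isl2cache;

        for (isspare = 0; isspare <= 1; isspare++) {
                for (isl2cache = 0; isl2cache <= 1; isl2cache++) {
                        cfg_flag_t flags =
                            (isspare ? CFG_SPARE : 0) |
                            (isl2cache ? CFG_L2CACHE : 0);

                        /* old test: !isspare && !isl2cache */
                        assert((!isspare && !isl2cache) ==
                            !(flags & (CFG_SPARE | CFG_L2CACHE)));
                }
        }
        return (0);
}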
+ */ + if ((flags & VDEV_CONFIG_REMOVING) && + !cvd->vdev_removing) + continue; - VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, vd->vdev_children) == 0); + child[idx++] = vdev_config_generate(spa, cvd, + getstats, flags); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + } - for (c = 0; c < vd->vdev_children; c++) + for (c = 0; c < idx; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { + const char *aux = NULL; + if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); + if (vd->vdev_resilvering) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING, + B_TRUE) == 0); if (vd->vdev_faulted) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE) == 0); @@ -327,11 +368,66 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_unspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE) == 0); + if (vd->vdev_ishole) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, + B_TRUE) == 0); + + switch (vd->vdev_stat.vs_aux) { + case VDEV_AUX_ERR_EXCEEDED: + aux = "err_exceeded"; + break; + + case VDEV_AUX_EXTERNAL: + aux = "external"; + break; + } + + if (aux != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, + aux) == 0); + + if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid) == 0); + } } return (nv); } +/* + * Generate a view of the top-level vdevs. If we currently have holes + * in the namespace, then generate an array which contains a list of holey + * vdevs. Additionally, add the number of top-level children that currently + * exist. + */ +void +vdev_top_config_generate(spa_t *spa, nvlist_t *config) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t *array; + uint_t c, idx; + + array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); + + for (c = 0, idx = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_ishole) + array[idx++] = c; + } + + if (idx) { + VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, + array, idx) == 0); + } + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + rvd->vdev_children) == 0); + + kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); +} + nvlist_t * vdev_label_read_config(vdev_t *vd) { @@ -478,6 +574,15 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, return (B_TRUE); /* + * We can't rely on a pool's state if it's been imported + * read-only. Instead we look to see if the pools is marked + * read-only in the namespace and set the state to active. + */ + if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL && + spa_mode(spa) == FREAD) + state = POOL_STATE_ACTIVE; + + /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ @@ -514,6 +619,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) crtxg, reason)) != 0) return (error); + /* Track the creation time for this vdev */ + vd->vdev_crtxg = crtxg; + if (!vd->vdev_ops->vdev_op_leaf) return (0); @@ -526,7 +634,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Determine if the vdev is in use. 
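vdev_top_config_generate() above packs the indices of hole top-level vdevs into a dense array before publishing it alongside the child count. A standalone sketch of that compaction over a made-up child list (the hole positions are arbitrary):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int
main(void)
{
        /* 1 marks a hole in the top-level vdev namespace. */
        int ishole[] = { 0, 1, 0, 0, 1, 0 };
        uint64_t children = sizeof (ishole) / sizeof (ishole[0]);
        uint64_t *array, c, idx;

        array = malloc(children * sizeof (uint64_t));
        for (c = 0, idx = 0; c < children; c++)
                if (ishole[c])
                        array[idx++] = c;

        printf("%ju of %ju top-level vdevs are holes:",
            (uintmax_t)idx, (uintmax_t)children);
        for (c = 0; c < idx; c++)
                printf(" %ju", (uintmax_t)array[c]);
        printf("\n");

        free(array);
        return (0);
}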
*/ - if (reason != VDEV_LABEL_REMOVE && + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); @@ -552,7 +660,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ if (reason == VDEV_LABEL_SPARE) return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); + ASSERT(reason == VDEV_LABEL_REPLACE || + reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && @@ -617,7 +726,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else { - label = spa_config_generate(spa, vd, 0ULL, B_FALSE); + uint64_t txg = 0ULL; + + if (reason == VDEV_LABEL_SPLIT) + txg = spa->spa_uberblock.ub_txg; + label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. This allows us to detect multiple @@ -642,8 +755,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; @@ -672,11 +785,9 @@ retry: offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); - } + vdev_label_write(zio, vd, l, ub, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); @@ -688,7 +799,7 @@ retry: nvlist_free(label); zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); + zio_buf_free(ub, VDEV_UBERBLOCK_RING); zio_buf_free(vp, sizeof (vdev_phys_t)); /* @@ -717,11 +828,6 @@ retry: */ /* - * For use by zdb and debugging purposes only - */ -uint64_t ub_max_txg = UINT64_MAX; - -/* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, @@ -750,6 +856,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) static void vdev_uberblock_load_done(zio_t *zio) { + spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; uberblock_t *ubbest = rio->io_private; @@ -758,7 +865,7 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); - if (ub->ub_txg <= ub_max_txg && + if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; mutex_exit(&rio->io_lock); @@ -976,6 +1083,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + + ASSERT(!vd->vdev_ishole); + zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? 
vdev_label_sync_ignore_done : vdev_label_sync_top_done, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index fff7e08..698c027 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,11 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); } +static const zio_vsd_ops_t vdev_mirror_vsd_ops = { + vdev_mirror_map_free, + zio_vsd_default_cksum_report +}; + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -117,28 +122,28 @@ vdev_mirror_map_alloc(zio_t *zio) } zio->io_vsd = mm; - zio->io_vsd_free = vdev_mirror_map_free; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - uint64_t c; int numerrors = 0; - int ret, lasterror = 0; + int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_open_children(vd); - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -158,9 +163,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } @@ -211,7 +214,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int i, c; - ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); /* * Try to find a child whose DTL doesn't contain the block to read. @@ -449,6 +452,8 @@ vdev_ops_t vdev_mirror_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -460,6 +465,8 @@ vdev_ops_t vdev_replacing_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -471,6 +478,8 @@ vdev_ops_t vdev_spare_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index 731f7d3..6a5588d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * VDEV_AUX_BAD_GUID_SUM. 
So we pretend to succeed, knowing that we * will fail the GUID sum check before ever trying to open the pool. */ - *psize = SPA_MINDEVSIZE; - *ashift = SPA_MINBLOCKSHIFT; + *psize = 0; + *ashift = 0; return (0); } @@ -80,6 +80,21 @@ vdev_ops_t vdev_missing_ops = { vdev_missing_io_start, vdev_missing_io_done, NULL, + NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; + +vdev_ops_t vdev_hole_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + NULL, + NULL, + VDEV_TYPE_HOLE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index de3f1db..b44f3b2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -24,7 +24,6 @@ */ #include -#include #include #include #include @@ -41,37 +40,48 @@ int zfs_vdev_max_pending = 10; int zfs_vdev_min_pending = 4; -/* deadline = pri + (LBOLT >> time_shift) */ +/* deadline = pri + ddi_get_lbolt64() >> time_shift) */ int zfs_vdev_time_shift = 6; /* exponential I/O issue ramp-up rate */ int zfs_vdev_ramp_rate = 2; /* - * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. - * For read i/os, we also aggregate across small adjacency gaps. + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; SYSCTL_DECL(_vfs_zfs_vdev); TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN, +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW, &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device"); TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN, +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW, &zfs_vdev_min_pending, 0, "Initial number of I/O requests pending to each device"); TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN, +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW, &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline"); TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN, +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW, &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate"); TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN, +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW, &zfs_vdev_aggregation_limit, 0, "I/O requests are aggregated up to this size"); +TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW, + &zfs_vdev_read_gap_limit, 0, + "Acceptable gap between two reads being aggregated"); +TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit); +SYSCTL_INT(_vfs_zfs_vdev, 
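The read and write gap limits published above feed the aggregation pass further down: two queued I/Os are merge candidates when the combined span stays under the aggregation limit and the gap between them is within the applicable limit. A simplified userland analogue of just that span/gap test, using invented I/O records rather than real zio structures:

#include <stdio.h>
#include <stdint.h>

struct io {
        uint64_t offset;
        uint64_t size;
};

/* Rough analogue of the IO_SPAN()/IO_GAP() checks used when aggregating. */
static int
can_aggregate(const struct io *first, const struct io *next,
    uint64_t span_limit, uint64_t gap_limit)
{
        uint64_t span = next->offset + next->size - first->offset;
        uint64_t gap = next->offset - (first->offset + first->size);

        return (span <= span_limit && gap <= gap_limit);
}

int
main(void)
{
        struct io a = { 0, 32 << 10 };
        struct io b = { (32 << 10) + (8 << 10), 16 << 10 };     /* 8K gap */

        printf("merge with 32K gap limit: %d\n",
            can_aggregate(&a, &b, 128 << 10, 32 << 10));        /* 1 */
        printf("merge with  4K gap limit: %d\n",
            can_aggregate(&a, &b, 128 << 10, 4 << 10));         /* 0 */
        return (0);
}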
OID_AUTO, write_gap_limit, CTLFLAG_RW, + &zfs_vdev_write_gap_limit, 0, + "Acceptable gap between two writes being aggregated"); /* * Virtual device vector for disk I/O scheduling. @@ -191,12 +201,14 @@ vdev_queue_agg_io_done(zio_t *aio) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio, *nio; + zio_t *fio, *lio, *aio, *dio, *nio, *mio; avl_tree_t *t; int flags; uint64_t maxspan = zfs_vdev_aggregation_limit; uint64_t maxgap; + int stretch; +again: ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || @@ -211,21 +223,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* - * We can aggregate I/Os that are adjacent and of the - * same flavor, as expressed by the AGG_INHERIT flags. - * The latter is necessary so that certain attributes - * of the I/O, such as whether it's a normal I/O or a - * scrub/resilver, can be preserved in the aggregate. + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. + */ + mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-option I/O. */ while ((dio = AVL_PREV(t, fio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) + IO_SPAN(dio, lio) <= maxspan && + IO_GAP(dio, fio) <= maxgap) { fio = dio; + if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = fio; + } + /* + * Skip any initial optional I/Os. + */ + while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { + fio = AVL_NEXT(t, fio); + ASSERT(fio != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + */ while ((dio = AVL_NEXT(t, lio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) + IO_SPAN(fio, dio) <= maxspan && + IO_GAP(lio, dio) <= maxgap) { lio = dio; + if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = lio; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (t != &vq->vq_read_tree && mio != NULL) { + nio = lio; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } + + if (stretch) { + /* This may be a no-op. 
*/ + VERIFY((dio = AVL_NEXT(t, lio)) != NULL); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (lio != mio && lio != fio) { + ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); + lio = AVL_PREV(t, lio); + ASSERT(lio != NULL); + } + } } if (fio != lio) { @@ -244,10 +323,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_vdev_tree == t); - if (dio->io_type == ZIO_TYPE_WRITE) + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT(dio->io_type == ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); + } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -263,6 +347,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (fio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(fio); + zio_execute(fio); + mutex_enter(&vq->vq_lock); + goto again; + } + avl_add(&vq->vq_pending_tree, fio); return (fio); @@ -288,7 +386,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; + zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_priority; vdev_queue_io_add(vq, zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 92753d8..4b0f560 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -35,12 +34,27 @@ /* * Virtual device vector for RAID-Z. * - * This vdev supports both single and double parity. For single parity, we - * use a simple XOR of all the data columns. For double parity, we use both - * the simple XOR as well as a technique described in "The mathematics of - * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), - * over the integers expressable in a single byte. Briefly, the operations on - * the field are defined as follows: + * This vdev supports single, double, and triple parity. For single parity, + * we use a simple XOR of all the data columns. For double or triple parity, + * we use a special case of Reed-Solomon coding. This extends the + * technique described in "The mathematics of RAID-6" by H. Peter Anvin by + * drawing on the system described in "A Tutorial on Reed-Solomon Coding for + * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the + * former is also based. The latter is designed to provide higher performance + * for writes. + * + * Note that the Plank paper claimed to support arbitrary N+M, but was then + * amended six years later identifying a critical flaw that invalidates its + * claims. Nevertheless, the technique can be adapted to work for up to + * triple parity. 
For additional parity, the amendment "Note: Correction to + * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding + * is viable, but the additional complexity means that write performance will + * suffer. + * + * All of the methods above operate on a Galois field, defined over the + * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements + * can be expressed with a single byte. Briefly, the operations on the + * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B @@ -55,22 +69,32 @@ * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * As an aside, this multiplication is derived from the error correcting + * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is A^254. + * than field addition). The inverse of a field element A (A^-1) is therefore + * A ^ (255 - 1) = A^254. * - * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, - * can be expressed by field operations: + * The up-to-three parity columns, P, Q, R over several data columns, + * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 + * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * See the reconstruction code below for how P and Q can used individually or - * in concert to recover missing data columns. + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival + * XOR operation, and 2 and 4 can be computed quickly and generate linearly- + * independent coefficients. (There are no additional coefficients that have + * this property which is why the uncorrected Plank method breaks down.) + * + * See the reconstruction code below for how P, Q and R can used individually + * or in concert to recover missing data columns. */ typedef struct raidz_col { @@ -78,27 +102,60 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? 
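The parity equations above are easy to verify by hand for a tiny stripe. The following standalone sketch computes P, Q and R over four arbitrary data bytes using the multiply-by-2 primitive quoted in the comment, then recovers one lost byte from P alone; the values and the gf_mul2() helper are illustrative, not part of the patch:

#include <stdio.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)              /* multiply by the generator 2 in GF(2^8) */
{
        return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int
main(void)
{
        uint8_t d[4] = { 0x11, 0x22, 0x33, 0x44 };      /* D_0 .. D_3 */
        uint8_t p = 0, q = 0, r = 0;
        int i;

        for (i = 0; i < 4; i++) {
                p ^= d[i];                              /* P: plain XOR      */
                q = gf_mul2(q) ^ d[i];                  /* Q: Horner, base 2 */
                r = gf_mul2(gf_mul2(r)) ^ d[i];         /* R: Horner, base 4 */
        }
        printf("P=%02x Q=%02x R=%02x\n", p, q, r);

        /* Losing D_0 with P intact: D_0 = P ^ D_1 ^ D_2 ^ D_3. */
        printf("recovered D_0 = %02x\n", p ^ d[1] ^ d[2] ^ d[3]);
        return (0);
}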
*/ } raidz_col_t; typedef struct raidz_map { - uint64_t rm_cols; /* Column count */ + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + void *rm_datacopy; /* rm_asize-buffer of copied data */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1d); \ +} -#define VDEV_RAIDZ_MAXPARITY 2 +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} -#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) +/* + * Force reconstruction to use the general purpose method. + */ +int vdev_raidz_default_to_general; /* * These two tables represent powers and logs of 2 in the Galois field defined @@ -173,6 +230,8 @@ static const uint8_t vdev_raidz_log2[256] = { 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; +static void vdev_raidz_generate_parity(raidz_map_t *rm); + /* * Multiply a given number by 2 raised to the given power. 
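VDEV_RAIDZ_64MUL_2 above applies the same multiply-by-2 to eight field elements at once; the subtraction borrows across bytes so that every byte whose high bit was set ends up covered by an 0xff mask and picks up the 0x1d reduction. A quick self-check against the byte-at-a-time form, with the macro copied locally for the test:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MUL2_64(x, mask)                                                \
{                                                                       \
        (mask) = (x) & 0x8080808080808080ULL;                           \
        (mask) = ((mask) << 1) - ((mask) >> 7);                         \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^                    \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL);                           \
}

static uint8_t
mul2_8(uint8_t a)
{
        return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int
main(void)
{
        uint64_t x = 0x0123456789abcdefULL, mask;
        uint8_t b[8];
        int i;

        memcpy(b, &x, sizeof (x));      /* byte order is irrelevant here */
        MUL2_64(x, mask);
        for (i = 0; i < 8; i++)
                b[i] = mul2_8(b[i]);

        printf("64-bit and byte-wise results match: %s\n",
            memcmp(b, &x, sizeof (x)) == 0 ? "yes" : "no");
        return (0);
}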
*/ @@ -193,17 +252,184 @@ vdev_raidz_exp2(uint_t a, int exp) } static void -vdev_raidz_map_free(zio_t *zio) +vdev_raidz_map_free(raidz_map_t *rm) { - raidz_map_t *rm = zio->io_vsd; int c; + size_t size; - for (c = 0; c < rm->rm_firstdatacol; c++) + for (c = 0; c < rm->rm_firstdatacol; c++) { zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + if (rm->rm_col[c].rc_gdata != NULL) + zio_buf_free(rm->rm_col[c].rc_gdata, + rm->rm_col[c].rc_size); + } + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + if (rm->rm_datacopy != NULL) + zio_buf_free(rm->rm_datacopy, size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); +} + +static void +vdev_raidz_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT3U(rm->rm_freed, ==, 0); + rm->rm_freed = 1; + + if (rm->rm_reports == 0) + vdev_raidz_map_free(rm); +} + +/*ARGSUSED*/ +static void +vdev_raidz_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed != 0) + vdev_raidz_map_free(rm); +} + +static void +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + size_t c = zcr->zcr_cbinfo; + size_t x; + + const char *good = NULL; + const char *bad = rm->rm_col[c].rc_data; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + if (c < rm->rm_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical ZIO) + */ + if (rm->rm_col[0].rc_gdata == NULL) { + char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + char *buf; + + /* + * Set up the rm_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. + */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + bad_parity[x] = rm->rm_col[x].rc_data; + rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + zio_buf_alloc(rm->rm_col[x].rc_size); + } + + /* fill in the data columns from good_data */ + buf = (char *)good_data; + for (; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity(rm); + + /* restore everything back to its original state */ + for (x = 0; x < rm->rm_firstdatacol; x++) + rm->rm_col[x].rc_data = bad_parity[x]; + + buf = rm->rm_datacopy; + for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + } + + ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); + good = rm->rm_col[c].rc_gdata; + } else { + /* adjust good_data to point at the start of our column */ + good = good_data; + + for (x = rm->rm_firstdatacol; x < c; x++) + good += rm->rm_col[x].rc_size; + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_raidz_cksum_finish() time we can compare it with the good data. 
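vdev_raidz_cksum_report() keeps the data it read by copying every data column into one contiguous allocation and re-pointing the columns at the copy, because the original zio buffer may be reused before the report is finished. A stripped-down sketch of that copy-aside step, with an invented column structure and toy payloads:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct col {
        char    *data;
        size_t   size;
};

int
main(void)
{
        char d0[] = "alpha", d1[] = "beta";
        struct col cols[2] = { { d0, sizeof (d0) }, { d1, sizeof (d1) } };
        size_t size = 0, i;
        char *copy, *buf;

        for (i = 0; i < 2; i++)
                size += cols[i].size;

        buf = copy = malloc(size);
        for (i = 0; i < 2; i++) {
                memcpy(buf, cols[i].data, cols[i].size);
                cols[i].data = buf;             /* column now points at copy */
                buf += cols[i].size;
        }

        /* The originals can now be reused without disturbing the columns. */
        memset(d0, 0, sizeof (d0));
        printf("%s %s\n", cols[0].data, cols[1].data);  /* alpha beta */

        free(copy);
        return (0);
}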
+ */ +static void +vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + caddr_t buf; + + raidz_map_t *rm = zio->io_vsd; + size_t size; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_raidz_cksum_finish; + zcr->zcr_free = vdev_raidz_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_datacopy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. + * + * Our parity data is already in separate buffers, so there's no need + * to copy them. + */ + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + buf = rm->rm_datacopy = zio_buf_alloc(size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bcopy(col->rc_data, buf, col->rc_size); + col->rc_data = buf; + + buf += col->rc_size; + } + ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); } +static const zio_vsd_ops_t vdev_raidz_vsd_ops = { + vdev_raidz_map_free_vsd, + vdev_raidz_cksum_report +}; + static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) @@ -213,24 +439,40 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, coff, devidx; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + if (q == 0) { + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } - acols = (q == 0 ? 
bc : dcols); + ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; + rm->rm_scols = scols; rm->rm_bigcols = bc; - rm->rm_asize = 0; + rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; + rm->rm_datacopy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; - for (c = 0; c < acols; c++) { + asize = 0; + + for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { @@ -239,15 +481,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; - rm->rm_asize += rm->rm_col[c].rc_size; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; } - rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); @@ -272,6 +526,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. 
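The geometry arithmetic in vdev_raidz_map_alloc() above decides how many real columns, reserved (skipped) columns and padding sectors an I/O needs. Working the same formulas for a hypothetical 2-sector write to a 5-disk raidz1 shows one skipped sector appearing:

#include <stdio.h>
#include <stdint.h>

#define RU(x, m)        ((((x) + (m) - 1) / (m)) * (m))         /* roundup */
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        uint64_t dcols = 5, nparity = 1, s = 2;         /* 2 data sectors */
        uint64_t q, r, bc, tot, acols, scols, nskip;

        q = s / (dcols - nparity);                      /* full rows: 0      */
        r = s - q * (dcols - nparity);                  /* remainder: 2      */
        bc = (r == 0 ? 0 : r + nparity);                /* oversized cols: 3 */
        tot = s + nparity * (q + (r == 0 ? 0 : 1));     /* total sectors: 3  */

        acols = (q == 0 ? bc : dcols);                                  /* 3 */
        scols = (q == 0 ? MIN(dcols, RU(bc, nparity + 1)) : dcols);     /* 4 */
        nskip = RU(tot, nparity + 1) - tot;             /* 1 padding sector  */

        printf("acols=%ju scols=%ju nskip=%ju\n",
            (uintmax_t)acols, (uintmax_t)scols, (uintmax_t)nskip);
        return (0);
}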
*/ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); @@ -283,10 +542,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; } zio->io_vsd = rm; - zio->io_vsd_free = vdev_raidz_map_free; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -305,12 +567,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } @@ -320,10 +582,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *q, *p, *src, pcount, ccount, mask, i; + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); @@ -331,55 +593,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount || ccount == 0); - for (i = 0; i < ccount; i++, p++, q++, src++) { - *q = *src; + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; + *q = *src; } - for (; i < pcount; i++, p++, q++, src++) { - *q = 0; + for (; i < pcnt; i++, src++, p++, q++) { *p = 0; + *q = 0; } } else { - ASSERT(ccount <= pcount); + ASSERT(ccnt <= pcnt); /* - * Rather than multiplying each byte individually (as - * described above), we are able to handle 8 at once - * by generating a mask based on the high bit in each - * byte and using that to conditionally XOR in 0x1d. + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. */ - for (i = 0; i < ccount; i++, p++, q++, src++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; } /* * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. */ - for (; i < pcount; i++, q++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); } } } } +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. + */ static void -vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, i; + int x = tgts[0]; int c; + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(xcount > 0); @@ -404,15 +749,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) *dst ^= *src; } } + + return (1 << VDEV_RAIDZ_P); } -static void -vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, mask, i; uint8_t *b; + int x = tgts[0]; int c, j, exp; + ASSERT(ntgts == 1); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); @@ -436,23 +786,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) } } else { - /* - * For an explanation of this, see the comment in - * vdev_raidz_generate_parity_pq() above. 
- */ for (i = 0; i < count; i++, dst++, src++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } for (; i < xcount; i++, dst++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); } } } @@ -467,15 +807,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) *b = vdev_raidz_exp2(*b, exp); } } + + return (1 << VDEV_RAIDZ_Q); } -static void -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; void *pdata, *qdata; uint64_t xsize, ysize, i; + int x = tgts[0]; + int y = tgts[1]; + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); @@ -553,93 +898,633 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) */ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coeffecients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a vandermonde + * matrix defined by the coeffecients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen both for simplicity, speedy + * computation as well as linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. 
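Because the generators 1 and 2 yield linearly independent coefficients over GF(2^8), the P and Q equations pin down any two missing data bytes uniquely, which is what both vdev_raidz_reconstruct_pq() and the general matrix method depend on. A brute-force confirmation for one toy stripe (data values arbitrary, helper names illustrative):

#include <stdio.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)
{
        return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static void
pq(const uint8_t d[4], uint8_t *p, uint8_t *q)
{
        int i;

        *p = *q = 0;
        for (i = 0; i < 4; i++) {
                *p ^= d[i];
                *q = gf_mul2(*q) ^ d[i];
        }
}

int
main(void)
{
        uint8_t d[4] = { 0xde, 0xad, 0xbe, 0xef }, t[4];
        uint8_t p, q, tp, tq;
        int x, y, hits = 0;

        pq(d, &p, &q);
        /* Pretend columns 1 and 3 are lost; count consistent candidates. */
        for (x = 0; x < 256; x++) {
                for (y = 0; y < 256; y++) {
                        t[0] = d[0]; t[2] = d[2];
                        t[1] = x; t[3] = y;
                        pq(t, &tp, &tq);
                        if (tp == p && tq == q)
                                hits++;
                }
        }
        printf("consistent solutions: %d\n", hits);     /* prints 1 */
        return (0);
}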
In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. + * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. 
+ * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ -static int -vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; - int c, error; - int lasterror = 0; - int numerrors = 0; + int i, j; + int pow; - ASSERT(nparity > 0); + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); - if (nparity > VDEV_RAIDZ_MAXPARITY || - vd->vdev_children < nparity + 1) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } } +} - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; - numerrors++; - continue; + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. + */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; } + } - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } } - *asize *= vd->vdev_children; + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT3U(rows[i][j], ==, 0); + } + ASSERT3U(rows[i][missing[i]], !=, 0); - if (numerrors > nparity) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. 
+ */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; - return (0); -} + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } -static void -vdev_raidz_close(vdev_t *vd) -{ - int c; + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} + ASSERT3U(rows[ii][missing[i]], !=, 0); -static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize; - uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + log = vdev_raidz_log2[rows[ii][missing[i]]]; - asize = ((psize - 1) >> ashift) + 1; - asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); - asize = roundup(asize, nparity + 1) << ashift; + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } - return (asize); + /* + * Verify that the data that is left in the rows are properly part of + * an identity matrix. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT3U(rows[i][j], ==, 0); + } + } + } } static void -vdev_raidz_child_done(zio_t *zio) +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) { - raidz_col_t *rc = zio->io_private; + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log, val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } - rc->rc_error = zio->io_error; - rc->rc_tried = 1; - rc->rc_skipped = 0; -} + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } -static int + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = rm->rm_col[c].rc_data; + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = rm->rm_col[cc].rc_data; + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + int code = 0; + + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. 
+ */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. + */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. + */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. + */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + return (code); +} + +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* + * See if we can use any of our optimized reconstruction routines. 
+ */ + if (!vdev_raidz_default_to_general) { + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} + +static int +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + vdev_open_children(vd); + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + *asize *= vd->vdev_children; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_raidz_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + + asize = ((psize - 1) >> ashift) + 1; + asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +static void +vdev_raidz_child_done(zio_t *zio) +{ + raidz_col_t *rc = zio->io_private; + + rc->rc_error = zio->io_error; + rc->rc_tried = 1; + rc->rc_skipped = 0; +} + +static int vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; - blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; - int c; + int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -647,13 +1532,7 @@ vdev_raidz_io_start(zio_t *zio) ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * Generate RAID parity in the first virtual columns. - */ - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -664,6 +1543,23 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. 
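The vdev_raidz_asize() computation above sizes an allocation by adding one parity sector per (cols - nparity) data sectors and rounding the total up to a multiple of nparity + 1. A standalone mirror of that arithmetic with one worked case (editorial sketch; the geometry in main() is made up for illustration):

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as vdev_raidz_asize(), outside the vdev tree. */
static uint64_t
raidz_asize(uint64_t psize, uint64_t ashift, uint64_t cols, uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;		/* data sectors */

	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = (asize + nparity) / (nparity + 1) * (nparity + 1); /* roundup */
	return (asize << ashift);
}

int
main(void)
{
	/*
	 * Example geometry: RAID-Z1, 5 disks, 512-byte sectors, 4 KB write:
	 * 8 data sectors + ceil(8 / 4) = 2 parity sectors = 10 sectors,
	 * already a multiple of nparity + 1, so 5120 bytes are allocated.
	 */
	printf("%llu\n", (unsigned long long)raidz_asize(4096, 9, 5, 1));
	return (0);
}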
+ */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + return (ZIO_PIPELINE_CONTINUE); } @@ -671,8 +1567,7 @@ vdev_raidz_io_start(zio_t *zio) /* * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity - * data. + * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; @@ -687,7 +1582,7 @@ vdev_raidz_io_start(zio_t *zio) rc->rc_skipped = 1; continue; } - if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -708,23 +1603,47 @@ vdev_raidz_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } + /* * Report a checksum error for a child of a RAID-Z device. */ static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + &zbc); } +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); + int ret = zio_checksum_error(zio, &zbc); + if (ret != 0 && zbc.zbc_injected != 0) + rm->rm_ecksuminjected = 1; + + return (ret); } /* @@ -748,17 +1667,14 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) bcopy(rc->rc_data, orig[c], rc->rc_size); } - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { - raidz_checksum_error(zio, rc); + raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = ECKSUM; ret++; } @@ -768,9 +1684,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -static uint64_t raidz_corrected_p; -static uint64_t raidz_corrected_q; -static uint64_t raidz_corrected_pq; +/* + * Keep statistics on all the ways that we used parity to correct data. + */ +static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int vdev_raidz_worst_error(raidz_map_t *rm) @@ -783,19 +1700,177 @@ vdev_raidz_worst_error(raidz_map_t *rm) return (error); } +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. 
For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + bcopy(rc->rc_data, orig[i], rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc, + orig[i]); + rc->rc_error = ECKSUM; + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + bcopy(orig[i], rc->rc_data, rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position.. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. 
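As a rough feel for the cost of this combination walk: with no columns already known bad it tries every set of 1..nparity columns (at least one of them a data column), so the number of checksum attempts is bounded by a sum of binomial coefficients. A small illustrative sketch (editorial, example geometry only):

#include <stdio.h>

/*
 * C(n, k) computed incrementally; each intermediate value is itself a
 * binomial coefficient, so the integer division is exact.
 */
static unsigned long long
choose(unsigned n, unsigned k)
{
	unsigned long long r = 1;
	unsigned i;

	for (i = 1; i <= k; i++)
		r = r * (n - k + i) / i;
	return (r);
}

int
main(void)
{
	unsigned cols = 10, nparity = 3, n;	/* example: RAID-Z3, 10 columns */
	unsigned long long attempts = 0;

	for (n = 1; n <= nparity; n++)
		attempts += choose(cols, n);
	/* 10 + 45 + 120 = 175 candidate combinations at most. */
	printf("%llu\n", attempts);
	return (0);
}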
+ */ + for (c = tgts[current - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[current] = c; + current++; + + } while (current != n); + } + } + n--; +done: + for (i = 0; i < n; i++) { + zio_buf_free(orig[i], rm->rm_col[0].rc_size); + } + + return (ret); +} + static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc, *rc1; + raidz_col_t *rc; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c, c1; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ @@ -859,9 +1934,8 @@ vdev_raidz_io_done(zio_t *zio) * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { - switch (data_errors) { - case 0: - if (zio_checksum_error(zio) == 0) { + if (data_errors == 0) { + if (raidz_checksum_verify(zio) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -880,9 +1954,7 @@ vdev_raidz_io_done(zio_t *zio) } goto done; } - break; - - case 1: + } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we @@ -894,45 +1966,38 @@ vdev_raidz_io_done(zio_t *zio) ASSERT(parity_errors < rm->rm_firstdatacol); /* - * Find the column that reported the error. + * Identify the data columns that reported an error. */ + n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - vdev_raidz_reconstruct_p(rm, c); - } else { - ASSERT(rm->rm_firstdatacol > 1); - vdev_raidz_reconstruct_q(rm, c); - } + ASSERT(rm->rm_firstdatacol >= n); - if (zio_checksum_error(zio) == 0) { - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) - atomic_inc_64(&raidz_corrected_p); - else - atomic_inc_64(&raidz_corrected_q); + code = vdev_raidz_reconstruct(rm, tgts, n); + + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); /* - * If there's more than one parity disk that - * was successfully read, confirm that the - * other parity disk produced the correct data. - * This routine is suboptimal in that it - * regenerates both the parity we wish to test - * as well as the parity we just used to - * perform the reconstruction, but this should - * be a relatively uncommon case, and can be - * optimized if it becomes a problem. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. 
*/ - if (parity_errors < rm->rm_firstdatacol - 1 || + if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; @@ -942,46 +2007,6 @@ vdev_raidz_io_done(zio_t *zio) goto done; } - break; - - case 2: - /* - * Two data column errors require double parity. - */ - ASSERT(rm->rm_firstdatacol == 2); - - /* - * Find the two columns that reported errors. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - for (c1 = c++; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - vdev_raidz_reconstruct_pq(rm, c1, c); - - if (zio_checksum_error(zio) == 0) { - atomic_inc_64(&raidz_corrected_pq); - goto done; - } - break; - - default: - ASSERT(rm->rm_firstdatacol <= 2); - ASSERT(0); } } @@ -1020,145 +2045,54 @@ vdev_raidz_io_done(zio_t *zio) * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. Before we attempt combinatorial reconstruction make - * sure we have a chance of coming up with the right answer. + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. */ - if (total_errors >= rm->rm_firstdatacol) { + if (total_errors > rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); - /* - * If there were exactly as many device errors as parity - * columns, yet we couldn't reconstruct the data, then at - * least one device must have returned bad data silently. - */ - if (total_errors == rm->rm_firstdatacol) - zio->io_error = zio_worst_error(zio->io_error, ECKSUM); - goto done; - } - - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - /* - * Attempt to reconstruct the data from parity P. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_p(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_p); - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { /* - * Attempt to reconstruct the data from parity Q. + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. 
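	 * (Editorial illustration, not part of the original change: with triple
	 * parity rm_firstdatacol is 3, so the "all parity used" mask below is
	 * (1 << 3) - 1 == 0x7; a reconstruction code of 0x5 means P and R were
	 * used but Q was not, and Q is the column still worth re-checking.)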
*/ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_q(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_q); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && - rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && - rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { /* - * Attempt to reconstruct the data from both P and Q. + * We're here because either: + * + * total_errors == rm_first_datacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, and the IO wasn't speculative. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { - void *orig, *orig1; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - - for (c1 = c + 1; c1 < rm->rm_cols; c1++) { - rc1 = &rm->rm_col[c1]; - - orig1 = zio_buf_alloc(rc1->rc_size); - bcopy(rc1->rc_data, orig1, rc1->rc_size); - - vdev_raidz_reconstruct_pq(rm, c, c1); + zio->io_error = ECKSUM; - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio_buf_free(orig1, rc1->rc_size); - atomic_inc_64(&raidz_corrected_pq); - - /* - * If these children didn't know they - * returned bad data, inform them. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - if (rc1->rc_tried && rc1->rc_error == 0) - raidz_checksum_error(zio, rc1); - - rc->rc_error = ECKSUM; - rc1->rc_error = ECKSUM; - - goto done; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); } - - bcopy(orig1, rc1->rc_data, rc1->rc_size); - zio_buf_free(orig1, rc1->rc_size); } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - /* - * All combinations failed to checksum. Generate checksum ereports for - * all children. 
- */ - zio->io_error = ECKSUM; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, - rc->rc_offset, rc->rc_size); } } @@ -1205,6 +2139,8 @@ vdev_ops_t vdev_raidz_ops = { vdev_raidz_io_start, vdev_raidz_io_done, vdev_raidz_state_change, + NULL, + NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index 88383f0..879f78f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - int c; int lasterror = 0; int numerrors = 0; @@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - int error; - if ((error = vdev_open(cvd)) != 0 && - !cvd->vdev_islog) { - lasterror = error; + if (cvd->vdev_open_error && !cvd->vdev_islog) { + lasterror = cvd->vdev_open_error; numerrors++; - continue; } } @@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_root_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } @@ -113,6 +109,8 @@ vdev_ops_t vdev_root_ops = { NULL, /* io_start - not applicable to the root */ NULL, /* io_done - not applicable to the root */ vdev_root_state_change, + NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 7abe63a..288a4d9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -19,13 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. 
@@ -45,11 +41,11 @@ #include #include #include +#include #include #include #include #include -#include int fzap_default_block_shift = 14; /* 16k blocksize */ @@ -73,7 +69,7 @@ fzap_byteswap(void *vbuf, size_t size) } void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx) +fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { dmu_buf_t *db; zap_leaf_t *l; @@ -86,7 +82,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, &zap->zap_f.zap_phys, zap_evict); - mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0); + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; zp = zap->zap_f.zap_phys; @@ -105,6 +101,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; + zp->zap_flags = flags; /* block 1 will be the first leaf */ for (i = 0; i < (1<zap_ptrtbl.zt_shift); i++) @@ -114,7 +111,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) * set up block 1 - the first leaf */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<zt_blks_copied; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs, FTAG, &db_old); + (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err) return (err); /* first half of entries in old[b] go to new[2*b+0] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs, FTAG, &db_new)); + (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs, FTAG, &db_new)); + (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); @@ -236,7 +233,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, off = idx & ((1<<(bs-3))-1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db); + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err) return (err); dmu_buf_will_dirty(db, tx); @@ -248,7 +245,8 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_buf_t *db2; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, + DMU_READ_NO_PREFETCH); if (err) { dmu_buf_rele(db, FTAG); return (err); @@ -279,7 +277,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) off = idx & ((1<<(bs-3))-1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db); + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -294,7 +292,8 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) blk = (idx*2) >> (bs-3); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk) << bs, FTAG, &db); + (tbl->zt_nextblk + blk) << bs, FTAG, &db, + DMU_READ_NO_PREFETCH); dmu_buf_rele(db, FTAG); } return (err); @@ -318,8 +317,13 @@ zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { - /* In case things go horribly wrong. 
*/ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. + */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (ENOSPC); if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { @@ -338,7 +342,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) newblk = zap_allocate_blocks(zap, 1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, + DMU_READ_NO_PREFETCH); if (err) return (err); dmu_buf_will_dirty(db_new, tx); @@ -389,14 +394,15 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0); + rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; l->l_phys = NULL; VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + DMU_READ_NO_PREFETCH)); winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -447,7 +453,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) ASSERT(blkid != 0); l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); - rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0); + rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit(db->db_size)-1; @@ -499,7 +505,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - blkid << bs, NULL, &db); + blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); if (err) return (err); @@ -703,13 +709,17 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) } } - static int -fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) +fzap_checkname(zap_name_t *zn) { - if (name && strlen(name) > ZAP_MAXNAMELEN) - return (E2BIG); + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + return (0); +} +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ /* Only integer sizes supported by C */ switch (integer_size) { case 1: @@ -727,6 +737,16 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) return (0); } +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err; + + if ((err = fzap_checkname(zn)) != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + /* * Routines for manipulating attributes. 
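The zap_grow_ptrtbl() cap above exists because the pointer table is indexed with the top zt_shift bits of the 64-bit hash; once zt_shift approaches the number of hash bits actually produced (28, or 48 for ZAP_FLAG_HASH64 zaps), further doubling would only index on zero bits. A minimal sketch of that top-bits indexing (editorial; hash_idx() is an assumed stand-in for the ZAP_HASH_IDX macro):

#include <stdint.h>

/* Assumed stand-in for ZAP_HASH_IDX-style indexing: take the top bits. */
static uint64_t
hash_idx(uint64_t h, int shift)
{
	return (shift == 0 ? 0 : h >> (64 - shift));
}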
*/ @@ -739,8 +759,7 @@ fzap_lookup(zap_name_t *zn, int err; zap_entry_handle_t zeh; - err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); - if (err != 0) + if ((err = fzap_checkname(zn)) != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); @@ -748,8 +767,13 @@ fzap_lookup(zap_name_t *zn, return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + err = zap_entry_read(&zeh, integer_size, num_integers, buf); - (void) zap_entry_read_name(&zeh, rn_len, realname); + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); @@ -772,8 +796,7 @@ fzap_add_cd(zap_name_t *zn, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); - ASSERT(fzap_checksize(zn->zn_name_orij, - integer_size, num_integers) == 0); + ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) @@ -787,7 +810,7 @@ retry: if (err != ENOENT) goto out; - err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd, + err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { @@ -810,12 +833,12 @@ fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { - int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_MAXCD, tx)); + val, ZAP_NEED_CD, tx)); } int @@ -828,7 +851,7 @@ fzap_update(zap_name_t *zn, zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); @@ -841,8 +864,8 @@ retry: ASSERT(err == 0 || err == ENOENT); if (create) { - err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, - ZAP_MAXCD, integer_size, num_integers, val, &zeh); + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { @@ -904,6 +927,21 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) return (err); } +void +fzap_prefetch(zap_name_t *zn) +{ + uint64_t idx, blk; + zap_t *zap = zn->zn_zap; + int bs; + + idx = ZAP_HASH_IDX(zn->zn_hash, + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + bs = FZAP_BLOCK_SHIFT(zap); + dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); +} + /* * Helper functions for consumers. 
*/ @@ -955,6 +993,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) } int +zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + err = zap_add(os, intoobj, za.za_name, + 8, 1, &value, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int +zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + uint64_t delta = 0; + + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + + err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); + if (err != 0 && err != ENOENT) + return (err); + delta += za.za_first_integer; + err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; @@ -981,6 +1069,56 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) return (zap_lookup(os, obj, name, 8, 1, &value)); } +int +zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} + +int +zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx) +{ + uint64_t value = 0; + int err; + + if (delta == 0) + return (0); + + err = zap_lookup(os, obj, name, 8, 1, &value); + if (err != 0 && err != ENOENT) + return (err); + value += delta; + if (value == 0) + err = zap_remove(os, obj, name, tx); + else + err = zap_update(os, obj, name, 8, 1, &value, tx); + return (err); +} + +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} + /* * Routines for iterating over the attributes. 
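The zap_increment()/zap_increment_int() helpers added above make it easy to keep per-key counters in a ZAP whose keys are printed integers. A short usage sketch, assuming the usual ZFS kernel environment (objset_t, dmu_tx_t, an open transaction); track_ref() is a hypothetical caller, not part of this patch:

/*
 * Hypothetical consumer: bump or drop a reference count stored under the
 * numeric key 'id'.  zap_increment_int() updates the 8-byte counter and,
 * per the code above, removes the entry entirely once it drops to zero.
 */
static int
track_ref(objset_t *os, uint64_t zapobj, uint64_t id, boolean_t hold,
    dmu_tx_t *tx)
{
	return (zap_increment_int(os, zapobj, id, hold ? 1 : -1, tx));
}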
*/ @@ -1042,7 +1180,7 @@ again: err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } - err = zap_entry_read_name(&zeh, + err = zap_entry_read_name(zap, &zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); @@ -1054,7 +1192,6 @@ again: return (err); } - static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { @@ -1081,6 +1218,31 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) } } +int +fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) +{ + int err; + zap_leaf_t *l; + zap_entry_handle_t zeh; + + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + return (err); + + zc->zc_leaf = l; + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + + return (err); +} + void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { @@ -1126,7 +1288,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db); + FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c index da498b6..19a795d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -19,24 +19,24 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The 512-byte leaf is broken into 32 16-byte chunks. * chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. 
*/ +#include +#include +#include #include +#include #include #include #include -#include -#include +#include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); @@ -127,12 +127,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); - le->le_int_size = BSWAP_8(le->le_int_size); + le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); - le->le_name_length = BSWAP_16(le->le_name_length); + le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); - le->le_value_length = BSWAP_16(le->le_value_length); + le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; @@ -215,7 +215,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, - int integer_size, int num_integers) + int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; @@ -273,11 +273,12 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, - char *buf) + void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; + char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); @@ -285,7 +286,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; - uint64_t *buf64 = (uint64_t *)buf; + uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | @@ -300,8 +301,8 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES); - buf += ZAP_LEAF_ARRAY_BYTES; + bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; @@ -316,50 +317,69 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { - stv(buf_int_len, buf, value); + stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; - buf += buf_int_len; + p += buf_int_len; } } chunk = la->la_next; } } -/* - * Only to be used on 8-bit arrays. - * array_len is actual len in bytes (not encoded le_value_length). - * namenorm is null-terminated. 
- */ static boolean_t -zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len) +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, + int chunk, int array_numints) { int bseen = 0; + if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { + uint64_t *thiskey; + boolean_t match; + + ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); + thiskey = kmem_alloc(array_numints * sizeof (*thiskey), + KM_SLEEP); + + zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, + sizeof (*thiskey), array_numints, thiskey); + match = bcmp(thiskey, zn->zn_key_orig, + array_numints * sizeof (*thiskey)) == 0; + kmem_free(thiskey, array_numints * sizeof (*thiskey)); + return (match); + } + + ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype == MT_FIRST) { - char *thisname = kmem_alloc(array_len, KM_SLEEP); + char *thisname = kmem_alloc(array_numints, KM_SLEEP); boolean_t match; - zap_leaf_array_read(l, chunk, 1, array_len, 1, - array_len, thisname); + zap_leaf_array_read(l, chunk, sizeof (char), array_numints, + sizeof (char), array_numints, thisname); match = zap_match(zn, thisname); - kmem_free(thisname, array_len); + kmem_free(thisname, array_numints); return (match); } - /* Fast path for exact matching */ - while (bseen < array_len) { + /* + * Fast path for exact matching. + * First check that the lengths match, so that we don't read + * past the end of the zn_key_orig array. + */ + if (array_numints != zn->zn_key_orig_numints) + return (B_FALSE); + while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); + int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread)) + if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } - return (bseen == array_len); + return (bseen == array_numints); } /* @@ -394,9 +414,9 @@ again: ASSERT(zn->zn_matchtype == MT_EXACT || (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, - le->le_name_length)) { - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; + le->le_name_numints)) { + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; @@ -427,7 +447,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, { uint16_t chunk; uint64_t besth = -1ULL; - uint32_t bestcd = ZAP_MAXCD; + uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; uint16_t lh; struct zap_leaf_entry *le; @@ -449,8 +469,8 @@ zap_leaf_lookup_closest(zap_leaf_t *l, besth = le->le_hash; bestcd = le->le_cd; - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; @@ -460,7 +480,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, } } - return (bestcd == ZAP_MAXCD ? ENOENT : 0); + return (bestcd == -1U ? 
ENOENT : 0); } int @@ -471,11 +491,12 @@ zap_entry_read(const zap_entry_handle_t *zeh, ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - if (le->le_int_size > integer_size) + if (le->le_value_intlen > integer_size) return (EINVAL); - zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size, - le->le_value_length, integer_size, num_integers, buf); + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, + le->le_value_intlen, le->le_value_numints, + integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (EOVERFLOW); @@ -484,15 +505,21 @@ zap_entry_read(const zap_entry_handle_t *zeh, } int -zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) +zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, + char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, - le->le_name_length, 1, buflen, buf); - if (le->le_name_length > buflen) + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, + le->le_name_numints, 8, buflen / 8, buf); + } else { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_numints, 1, buflen, buf); + } + if (le->le_name_numints > buflen) return (EOVERFLOW); return (0); } @@ -506,24 +533,16 @@ zap_entry_update(zap_entry_handle_t *zeh, struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size); + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) return (EAGAIN); - /* - * We should search other chained leaves (via - * zap_entry_remove,create?) otherwise returning EAGAIN will - * just send us into an infinite loop if we have to chain - * another leaf block, rather than being able to split this - * block. 
- */ - zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; return (0); } @@ -550,26 +569,25 @@ zap_entry_remove(zap_entry_handle_t *zeh) } int -zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, +zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; uint16_t *chunkp; struct zap_leaf_entry *le; - uint64_t namelen, valuelen; + uint64_t valuelen; int numchunks; + uint64_t h = zn->zn_hash; valuelen = integer_size * num_integers; - namelen = strlen(name) + 1; - ASSERT(namelen >= 2); - numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) + - ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (E2BIG); - if (cd == ZAP_MAXCD) { + if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; @@ -586,7 +604,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, } } else { /* old unsorted format; do it the O(n^2) way */ - for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -601,10 +619,10 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, } } /* - * we would run out of space in a block before we could - * have ZAP_MAXCD entries + * We would run out of space in a block before we could + * store enough entries to run out of CD values. 
*/ - ASSERT3U(cd, <, ZAP_MAXCD); + ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (l->l_phys->l_hdr.lh_nfree < numchunks) @@ -614,12 +632,13 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; - le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen); - le->le_name_length = namelen; + le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, + zn->zn_key_intlen, zn->zn_key_orig_numints); + le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; @@ -631,7 +650,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; - zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; @@ -673,7 +692,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, - le->le_name_chunk, le->le_name_length)) { + le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -836,9 +855,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); - n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * - le->le_int_size); + n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * + le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 9453fd2..b403097 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include #include #include #include @@ -33,38 +31,98 @@ #include #include #include +#include #ifdef _KERNEL #include #endif -static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap->zap_u.zap_fat.zap_phys->zap_flags); +} + +int +zap_hashbits(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} + +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} static uint64_t -zap_hash(zap_t *zap, const char *normname) +zap_hash(zap_name_t *zn) { - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; + zap_t *zap = zn->zn_zap; + uint64_t h = 0; - /* NB: name must already be normalized, if necessary */ + if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + int i; + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { + int j; + uint64_t word = *wp; + + for (j = 0; j < zn->zn_key_intlen; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + int i, len; + const uint8_t *cp = zn->zn_key_norm; - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; - } + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + len = zn->zn_key_norm_numints - 1; + ASSERT(zn->zn_key_intlen == 1); + for (i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } + } /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the ones that we first pay attention to when + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when * chosing the bucket. 
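zap_hash() above folds each key byte (or each byte of each 64-bit word) into a CRC-64 seeded with the per-object salt. The per-byte step, pulled out as a standalone sketch (editorial; crc_table stands in for zfs_crc64_table and is assumed to be built from ZFS_CRC64_POLY):

#include <stdint.h>

/* One CRC-64 step: shift the running value and fold in one key byte.
 * crc_table is an assumed stand-in for zfs_crc64_table. */
static uint64_t
crc64_byte(uint64_t h, uint8_t c, const uint64_t crc_table[256])
{
	return ((h >> 8) ^ crc_table[(h ^ c) & 0xFF]);
}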
*/ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - return (crc); + return (h); } static int @@ -73,13 +131,15 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) size_t inlen, outlen; int err; + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + inlen = strlen(name) + 1; outlen = ZAP_MAXNAMELEN; err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, - &err); + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | + U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } @@ -87,16 +147,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) boolean_t zap_match(zap_name_t *zn, const char *matchname) { + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + if (zn->zn_matchtype == MT_FIRST) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm) != 0) return (B_FALSE); - return (strcmp(zn->zn_name_norm, norm) == 0); + return (strcmp(zn->zn_key_norm, norm) == 0); } else { /* MT_BEST or MT_EXACT */ - return (strcmp(zn->zn_name_orij, matchname) == 0); + return (strcmp(zn->zn_key_orig, matchname) == 0); } } @@ -106,30 +168,49 @@ zap_name_free(zap_name_t *zn) kmem_free(zn, sizeof (zap_name_t)); } -/* XXX combine this with zap_lockdir()? */ zap_name_t * -zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; - zn->zn_name_orij = name; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; if (zap->zap_normflags) { - if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + if (zap_normalize(zap, key, zn->zn_normbuf) != 0) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_normbuf; + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != MT_EXACT) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_name_orij; + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; } - zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + zn->zn_hash = zap_hash(zn); + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + ASSERT(zap->zap_normflags == 0); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = MT_EXACT; + + zn->zn_hash = zap_hash(zn); return (zn); } @@ -174,27 +255,27 @@ mze_compare(const void *arg1, const void *arg2) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); - if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + if (mze1->mze_cd > mze2->mze_cd) return (+1); - if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static int -mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < ZAP_MAXCD); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; - mze->mze_phys = *mzep; + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + 
ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { kmem_free(mze, sizeof (mzap_ent_t)); return (EEXIST); @@ -214,18 +295,16 @@ mze_find(zap_name_t *zn) ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) - return (NULL); - mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - if (zap_match(zn, mze->mze_phys.mze_name)) + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } if (zn->zn_matchtype == MT_BEST) { @@ -248,12 +327,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_phys.mze_cd != cd) + if (mze->mze_cd != cd) break; cd++; } @@ -292,15 +371,14 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); - rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0); + rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (*(uint64_t *)db->db_data != ZBT_MICRO) { - mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, - MUTEX_DEFAULT, 0); + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; } else { zap->zap_ismicro = TRUE; @@ -337,7 +415,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - if (mze_insert(zap, i, zn->zn_hash, mze) == 0) + if (mze_insert(zap, i, zn->zn_hash) == 0) zap->zap_m.zap_num_entries++; else { printf("ZFS WARNING: Duplicated ZAP " @@ -385,7 +463,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, *zapp = NULL; - err = dmu_buf_hold(os, obj, 0, NULL, &db); + err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH); if (err) return (err); @@ -435,7 +513,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; - return (mzap_upgrade(zapp, tx)); + return (mzap_upgrade(zapp, tx, 0)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT3U(err, ==, 0); @@ -455,10 +533,11 @@ zap_unlockdir(zap_t *zap) } static int -mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; - int i, sz, nchunks, err; + int i, sz, nchunks; + int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -468,11 +547,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - if (err) { - kmem_free(mzp, sz); - return (err); + if (!flags) { + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err) { + kmem_free(mzp, sz); + return (err); + } } dprintf("upgrading obj=%llu with %u chunks\n", @@ 
-480,10 +561,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); - fzap_upgrade(zap, tx); + fzap_upgrade(zap, tx, flags); for (i = 0; i < nchunks; i++) { - int err; mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; zap_name_t *zn; if (mze->mze_name[0] == 0) @@ -503,12 +583,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) } static void -mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, + dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); #ifdef ZFS_DEBUG { @@ -524,6 +605,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); + + if (flags != 0) { + zap_t *zap; + /* Only fat zap supports flags; upgrade immediately. */ + VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); + zap_unlockdir(zap); + } } int @@ -544,7 +634,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); return (0); } @@ -561,7 +651,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); + return (obj); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && + leaf_blockshift <= SPA_MAXBLOCKSHIFT && + indirect_blockshift >= SPA_MINBLOCKSHIFT && + indirect_blockshift <= SPA_MAXBLOCKSHIFT); + + VERIFY(dmu_object_set_blocksize(os, obj, + 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); + + mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } @@ -631,11 +740,11 @@ again: other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { - zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } - if (zap_match(zn, other->mze_phys.mze_name)) { + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -697,9 +806,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } else if (integer_size != 8) { err = EINVAL; } else { - *(uint64_t *)buf = mze->mze_phys.mze_value; + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, - mze->mze_phys.mze_name, rn_len); + MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); @@ -713,6 +823,63 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn 
= zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = (zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, MT_EXACT, NULL, 0, NULL)); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + +int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { @@ -747,6 +914,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + static void mzap_addent(zap_name_t *zn, uint64_t value) { @@ -755,20 +944,18 @@ mzap_addent(zap_name_t *zn, uint64_t value) int start = zap->zap_m.zap_alloc_next; uint32_t cd; - dprintf("obj=%llu %s=%llu\n", zap->zap_object, - zn->zn_name_orij, value); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); + ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ - ASSERT(cd != ZAP_MAXCD); + ASSERT(cd < zap_maxcd(zap)); again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { @@ -776,13 +963,13 @@ again: if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; - (void) strcpy(mze->mze_name, zn->zn_name_orij); + (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - VERIFY(0 == mze_insert(zap, i, zn->zn_hash, mze)); + VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); return; } } @@ -794,7 +981,7 @@ again: } int -zap_add(objset_t *os, uint64_t zapobj, const char *name, +zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { @@ -807,7 +994,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, MT_EXACT); + zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); @@ -816,10 +1003,8 @@ 
zap_add(objset_t *os, uint64_t zapobj, const char *name, err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + strlen(key) >= MZAP_NAME_LEN) { + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ @@ -839,15 +1024,50 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, } int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); + return (err); +} + +int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; mzap_ent_t *mze; + uint64_t oldval; const uint64_t *intval = val; zap_name_t *zn; int err; +#ifdef ZFS_DEBUG + /* + * If there is an old value, it shouldn't change across the + * lockdir (eg, due to bprewrite's xlation). + */ + if (integer_size == 8 && num_integers == 1) + (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); +#endif + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); @@ -863,7 +1083,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_update(zn, integer_size, num_integers, val, tx); @@ -871,9 +1091,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } else { mze = mze_find(zn); if (mze != NULL) { - mze->mze_phys.mze_value = *intval; - zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid].mze_value = *intval; + ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); + MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } @@ -886,6 +1105,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + zap_name_t *zn; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); + return (err); +} + +int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); 
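/*
 * Illustrative sketch (not part of the patch): how a consumer of the new
 * uint64-keyed ZAP interfaces introduced above might store and fetch a value
 * under a two-word binary key, similar to how the DDT keys its entries.
 * Only zap_add_uint64()/zap_lookup_uint64() and their signatures come from
 * the patch itself; the function names, the example key layout, and the
 * assumption that "zapobj" was created via zap_create_flags() with
 * ZAP_FLAG_UINT64_KEY set are hypothetical.
 */
#include <sys/zap.h>	/* assumed header providing the prototypes used below */

static int
example_store_by_blkid(objset_t *os, uint64_t zapobj, uint64_t objid,
    uint64_t blkid, uint64_t val, dmu_tx_t *tx)
{
	uint64_t key[2];

	key[0] = objid;		/* first word of the binary key */
	key[1] = blkid;		/* second word of the binary key */

	/* store one 8-byte integer under the 2-word key */
	return (zap_add_uint64(os, zapobj, key, 2, 8, 1, &val, tx));
}

static int
example_fetch_by_blkid(objset_t *os, uint64_t zapobj, uint64_t objid,
    uint64_t blkid, uint64_t *valp)
{
	uint64_t key[2];

	key[0] = objid;
	key[1] = blkid;

	/* expect exactly one 8-byte integer back */
	return (zap_lookup_uint64(os, zapobj, key, 2, 8, 1, valp));
}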
@@ -926,17 +1170,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + /* * Routines for iterating over the attributes. */ -/* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So use a small hash value so - * we can fit 4 bits of cd into the 32-bit cursor. - * - * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] - */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) @@ -945,15 +1204,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; - if (serialized == -1ULL) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - zc->zc_hash = serialized << (64-ZAP_HASHBITS); - zc->zc_cd = serialized >> ZAP_HASHBITS; - if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ - zc->zc_cd = 0; - } + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; } void @@ -983,10 +1236,21 @@ zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); - ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); - ASSERT(zc->zc_cd < ZAP_MAXCD); - return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | - ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. + * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int @@ -1001,10 +1265,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) return (ENOENT); if (zc->zc_zap == NULL) { + int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. 
+ */ + ASSERT(zc->zc_hash == 0); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } @@ -1014,7 +1291,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) err = ENOENT; mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_phys.mze_cd = zc->zc_cd; + mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { @@ -1022,18 +1299,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) idx, AVL_AFTER); } if (mze) { - ASSERT(0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid], sizeof (mze->mze_phys))); - + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; - za->za_first_integer = mze->mze_phys.mze_value; - (void) strcpy(za->za_name, mze->mze_phys.mze_name); + za->za_first_integer = mzep->mze_value; + (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; @@ -1049,12 +1324,46 @@ zap_cursor_advance(zap_cursor_t *zc) if (zc->zc_hash == -1ULL) return; zc->zc_cd++; - if (zc->zc_cd >= ZAP_MAXCD) { - zc->zc_cd = 0; - zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); - if (zc->zc_hash == 0) /* EOF */ - zc->zc_hash = -1ULL; +} + +int +zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) +{ + int err = 0; + mzap_ent_t *mze; + zap_name_t *zn; + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + } + + zn = zap_name_alloc(zc->zc_zap, name, mt); + if (zn == NULL) { + rw_exit(&zc->zc_zap->zap_rwlock); + return (ENOTSUP); + } + + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_move_to_key(zc, zn); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + goto out; + } + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_cd; } + +out: + zap_name_free(zn); + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); } int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c index fc25bfe..f893383 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -48,6 +47,7 @@ #include #include #include +#include #include #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE @@ -71,8 +71,7 @@ #define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ ACE_DELETE|ACE_DELETE_CHILD) -#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ - ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) @@ -319,6 +318,117 @@ static acl_ops_t zfs_acl_fuid_ops = { zfs_ace_fuid_data }; +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. 
+ */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + static int zfs_acl_version(int version) { @@ -334,7 +444,7 @@ zfs_acl_version_zp(znode_t *zp) return (zfs_acl_version(zp->z_zfsvfs->z_version)); } -static zfs_acl_t * +zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; @@ -350,7 +460,7 @@ zfs_acl_alloc(int vers) return (aclp); } -static zfs_acl_node_t * +zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; @@ -461,6 +571,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, { zfs_acl_node_t *aclnode; + ASSERT(aclp); + if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) @@ -507,6 +619,7 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, *who = aclp->z_ops.ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; + return ((void *)acep); } return (NULL); @@ -540,7 +653,7 @@ zfs_acl_curr_node(zfs_acl_t *aclp) */ int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, - void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; @@ -771,8 +884,9 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, * Determine mode of file based on ACL. * Also, create FUIDs for any User/Group ACEs */ -static uint64_t -zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) { int entry_type; mode_t mode; @@ -783,7 +897,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; - mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { @@ -801,7 +915,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) entry_type == OWNING_GROUP)) continue; - if (entry_type == ACE_OWNER) { + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; @@ -823,7 +938,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) mode |= S_IXUSR; } } - } else if (entry_type == OWNING_GROUP) { + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; @@ -928,61 +1044,29 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) an_exec_denied = B_TRUE; if (an_exec_denied) - zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + *pflags &= ~ZFS_NO_EXECS_DENIED; else - zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } -static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - - /* - * Version 0 to 1 znode_acl_phys has the size/count fields swapped. - * Version 0 didn't have a size field, only a count. - */ - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; - aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); - } else { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; - } - - aclnode = zfs_acl_node_alloc(will_modify ? 
aclp->z_acl_bytes : 0); - aclnode->z_ace_count = aclp->z_acl_count; - if (will_modify) { - bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, - aclp->z_acl_bytes); - } else { - aclnode->z_size = aclp->z_acl_bytes; - aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; - } - - list_insert_head(&aclp->z_acl, aclnode); - - return (aclp); -} - /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. */ static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) { - uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; - size_t aclsize; - size_t acl_count; + int aclsize; + int acl_count; zfs_acl_node_t *aclnode; - int error; + zfs_acl_phys_t znode_acl; + int version; + int error; + boolean_t drop_lock = B_FALSE; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); @@ -991,46 +1075,97 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) return (0); } - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp, will_modify); - if (!will_modify) - zp->z_acl_cached = *aclpp; - return (0); + /* + * close race where znode could be upgrade while trying to + * read the znode attributes. + * + * But this could only happen if the file isn't already an SA + * znode + */ + if (!zp->z_is_sa && !have_lock) { + mutex_enter(&zp->z_lock); + drop_lock = B_TRUE; } + version = zfs_znode_acl_version(zp); - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - zfs_acl_phys_v0_t *zacl0 = - (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; - - aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); - acl_count = zacl0->z_acl_count; - } else { - aclsize = zp->z_phys->zp_acl.z_acl_size; - acl_count = zp->z_phys->zp_acl.z_acl_count; - if (aclsize == 0) - aclsize = acl_count * sizeof (zfs_ace_t); + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; } - aclnode = zfs_acl_node_alloc(aclsize); - list_insert_head(&aclp->z_acl, aclnode); - error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); - aclnode->z_ace_count = acl_count; + + aclp = zfs_acl_alloc(version); + aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + if (error != 0) { zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) error = EIO; - return (error); + goto done; } + list_insert_head(&aclp->z_acl, aclnode); + *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; - return (0); +done: + if (drop_lock) + mutex_exit(&zp->z_lock); + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = 
list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); } /* @@ -1043,28 +1178,35 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { - int error; - znode_phys_t *zphys = zp->z_phys; - zfs_acl_phys_t *zacl = &zphys->zp_acl; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; - uint64_t off = 0; - dmu_object_type_t otype; - zfs_acl_node_t *aclnode; + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); - dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } - zphys->zp_mode = zfs_mode_compute(zp, aclp); - /* - * Decide which object type to use. If we are forced to - * use old ACL format then transform ACL into zfs_oldace_t - * layout. + * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; @@ -1076,84 +1218,113 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) otype = DMU_OT_ACL; } - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && aclp->z_version != zacl->z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx); + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + zfs_acl_phys_t acl_phys; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. 
+ */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - aclp->z_acl_bytes, 0, tx); - } - zphys->zp_acl.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } } - } else { - void *start = zacl->z_ace_data; /* - * Migrating back embedded? + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. */ - if (zphys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - zphys->zp_acl.z_acl_extern_obj = 0; - } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; } - } + acl_phys.z_acl_version = aclp->z_version; - /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. - */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - zphys->zp_acl.z_acl_size = aclp->z_acl_count; - zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; - } else { - zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); } - zphys->zp_acl.z_acl_version = aclp->z_version; - /* * Replace ACL wide bits, but first clear them. 
*/ - zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; - zp->z_phys->zp_flags |= aclp->z_hints; + zp->z_pflags |= aclp->z_hints; if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; + zp->z_pflags |= ZFS_ACL_TRIVIAL; - return (0); + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } /* @@ -1223,314 +1394,64 @@ zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, aclp->z_ops.ace_mask_set(acep, acepmask); } -/* - * Apply mode to canonical six ACEs. - */ -static void -zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) -{ - zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); - void *acep; - int maskoff = aclp->z_ops.ace_mask_off(); - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - - ASSERT(aclnode != NULL); - - acep = (void *)((caddr_t)aclnode->z_acldata + - aclnode->z_size - (abstract_size * 6)); - - /* - * Fixup final ACEs to match the mode - */ - - adjust_ace_pair_common(acep, maskoff, abstract_size, - (mode & 0700) >> 6); /* owner@ */ - - acep = (caddr_t)acep + (abstract_size * 2); - - adjust_ace_pair_common(acep, maskoff, abstract_size, - (mode & 0070) >> 3); /* group@ */ - - acep = (caddr_t)acep + (abstract_size * 2); - adjust_ace_pair_common(acep, maskoff, - abstract_size, mode); /* everyone@ */ -} - - -static int -zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, - int entry_type, int accessmask) -{ - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - uint16_t type = aclp->z_ops.ace_type_get(acep); - uint16_t flags = aclp->z_ops.ace_flags_get(acep); - - return (mask == accessmask && type == allow_deny && - ((flags & ACE_TYPE_FLAGS) == entry_type)); -} - -/* - * Can prepended ACE be reused? - */ -static int -zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) -{ - int okay_masks; - uint16_t prevtype; - uint16_t prevflags; - uint16_t flags; - uint32_t mask, prevmask; - - if (prevacep == NULL) - return (B_FALSE); - - prevtype = aclp->z_ops.ace_type_get(prevacep); - prevflags = aclp->z_ops.ace_flags_get(prevacep); - flags = aclp->z_ops.ace_flags_get(acep); - mask = aclp->z_ops.ace_mask_get(acep); - prevmask = aclp->z_ops.ace_mask_get(prevacep); - - if (prevtype != DENY) - return (B_FALSE); - - if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) - return (B_FALSE); - - okay_masks = (mask & OKAY_MASK_BITS); - - if (prevmask & ~okay_masks) - return (B_FALSE); - - return (B_TRUE); -} - - -/* - * Insert new ACL node into chain of zfs_acl_node_t's - * - * This will result in two possible results. - * 1. If the ACL is currently just a single zfs_acl_node and - * we are prepending the entry then current acl node will have - * a new node inserted above it. - * - * 2. If we are inserting in the middle of current acl node then - * the current node will be split in two and new node will be inserted - * in between the two split nodes. 
- */ -static zfs_acl_node_t * -zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) -{ - zfs_acl_node_t *newnode; - zfs_acl_node_t *trailernode = NULL; - zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp); - int curr_idx = aclp->z_curr_node->z_ace_idx; - int trailer_count; - size_t oldsize; - - newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep)); - newnode->z_ace_count = 1; - - oldsize = currnode->z_size; - - if (curr_idx != 1) { - trailernode = zfs_acl_node_alloc(0); - trailernode->z_acldata = acep; - - trailer_count = currnode->z_ace_count - curr_idx + 1; - currnode->z_ace_count = curr_idx - 1; - currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata; - trailernode->z_size = oldsize - currnode->z_size; - trailernode->z_ace_count = trailer_count; - } - - aclp->z_acl_count += 1; - aclp->z_acl_bytes += aclp->z_ops.ace_size(acep); - - if (curr_idx == 1) - list_insert_before(&aclp->z_acl, currnode, newnode); - else - list_insert_after(&aclp->z_acl, currnode, newnode); - if (trailernode) { - list_insert_after(&aclp->z_acl, newnode, trailernode); - aclp->z_curr_node = trailernode; - trailernode->z_ace_idx = 1; - } - - return (newnode); -} - -/* - * Prepend deny ACE - */ -static void * -zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep, - mode_t mode) -{ - zfs_acl_node_t *aclnode; - void *newacep; - uint64_t fuid; - uint16_t flags; - - aclnode = zfs_acl_ace_insert(aclp, acep); - newacep = aclnode->z_acldata; - fuid = aclp->z_ops.ace_who_get(acep); - flags = aclp->z_ops.ace_flags_get(acep); - zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid); - - return (newacep); -} - -/* - * Split an inherited ACE into inherit_only ACE - * and original ACE with inheritance flags stripped off. - */ static void -zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep) +zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) { - zfs_acl_node_t *aclnode; - zfs_acl_node_t *currnode; - void *newacep; - uint16_t type, flags; - uint32_t mask; - uint64_t fuid; - - type = aclp->z_ops.ace_type_get(acep); - flags = aclp->z_ops.ace_flags_get(acep); - mask = aclp->z_ops.ace_mask_get(acep); - fuid = aclp->z_ops.ace_who_get(acep); - - aclnode = zfs_acl_ace_insert(aclp, acep); - newacep = aclnode->z_acldata; - - aclp->z_ops.ace_type_set(newacep, type); - aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE); - aclp->z_ops.ace_mask_set(newacep, mask); - aclp->z_ops.ace_type_set(newacep, type); - aclp->z_ops.ace_who_set(newacep, fuid); - aclp->z_next_ace = acep; - flags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep, flags); - currnode = zfs_acl_curr_node(aclp); - ASSERT(currnode->z_ace_idx >= 1); - currnode->z_ace_idx -= 1; -} - -/* - * Are ACES started at index i, the canonical six ACES? 
- */ -static int -zfs_have_canonical_six(zfs_acl_t *aclp) -{ - void *acep; - zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); - int i = 0; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - - ASSERT(aclnode != NULL); - - if (aclnode->z_ace_count < 6) - return (0); - - acep = (void *)((caddr_t)aclnode->z_acldata + - aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6)); - - if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - DENY, ACE_OWNER, 0) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, - OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep + - (abstract_size * i++), - ALLOW, OWNING_GROUP, 0) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) { - return (1); - } else { - return (0); - } -} - - -/* - * Apply step 1g, to group entries - * - * Need to deal with corner case where group may have - * greater permissions than owner. If so then limit - * group permissions, based on what extra permissions - * group has. - */ -static void -zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, - mode_t mode) -{ - uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep); - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep); - mode_t extramode = (mode >> 3) & 07; - mode_t ownermode = (mode >> 6); - - if (prevflags & ACE_IDENTIFIER_GROUP) { - - extramode &= ~ownermode; - - if (extramode) { - if (extramode & S_IROTH) { - prevmask &= ~ACE_READ_DATA; - mask &= ~ACE_READ_DATA; - } - if (extramode & S_IWOTH) { - prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - } - if (extramode & S_IXOTH) { - prevmask &= ~ACE_EXECUTE; - mask &= ~ACE_EXECUTE; - } - } - } - aclp->z_ops.ace_mask_set(acep, mask); - aclp->z_ops.ace_mask_set(prevacep, prevmask); -} - -/* - * Apply the chmod algorithm as described - * in PSARC/2002/240 - */ -static void -zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, - uint64_t mode, zfs_acl_t *aclp) -{ - void *acep = NULL, *prevacep = NULL; + void *acep = NULL; uint64_t who; - int i; + int new_count, new_bytes; + int ace_size; int entry_type; - int reuse_deny; - int need_canonical_six = 1; uint16_t iflags, type; uint32_t access_mask; - - /* - * If discard then just discard all ACL nodes which - * represent the ACEs. - * - * New owner@/group@/everone@ ACEs will be added - * later. 
- */ - if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) - zfs_acl_release_nodes(aclp); + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + uint32_t owner, group, everyone; + uint32_t deny1, deny2, allow0; + + new_count = new_bytes = 0; + + acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2, + &owner, &group, &everyone); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (allow0) { + zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } if (deny1) { + zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (deny2) { + zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { + uint16_t inherit_flags; entry_type = (iflags & ACE_TYPE_FLAGS); - iflags = (iflags & ALL_INHERIT); + inherit_flags = (iflags & ALL_INHERIT); + + if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + (entry_type == OWNING_GROUP)) && + ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) { + continue; + } if ((type != ALLOW && type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - if (iflags) + (inherit_flags & ACE_INHERIT_ONLY_ACE)) { + if (inherit_flags) aclp->z_hints |= ZFS_INHERIT_ACE; switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: @@ -1540,116 +1461,58 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, aclp->z_hints |= ZFS_ACL_OBJ_ACE; break; } - goto nextace; - } - - /* - * Need to split ace into two? - */ - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) && - (!(iflags & ACE_INHERIT_ONLY_ACE))) { - zfs_acl_split_ace(aclp, acep); - aclp->z_hints |= ZFS_INHERIT_ACE; - goto nextace; - } - - if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - (entry_type == OWNING_GROUP)) { - access_mask &= ~OGE_CLEAR; - aclp->z_ops.ace_mask_set(acep, access_mask); - goto nextace; } else { - reuse_deny = B_TRUE; - if (type == ALLOW) { - - /* - * Check preceding ACE if any, to see - * if we need to prepend a DENY ACE. - * This is only applicable when the acl_mode - * property == groupmask. - */ - if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { - - reuse_deny = zfs_reuse_deny(aclp, acep, - prevacep); - - if (!reuse_deny) { - prevacep = - zfs_acl_prepend_deny(uid, - aclp, acep, mode); - } else { - zfs_acl_prepend_fixup( - aclp, prevacep, - acep, mode, uid); - } - zfs_fixup_group_entries(aclp, acep, - prevacep, mode); - } - } - } -nextace: - prevacep = acep; - } - - /* - * Check out last six aces, if we have six. 
- */ - if (aclp->z_acl_count >= 6) { - if (zfs_have_canonical_six(aclp)) { - need_canonical_six = 0; + /* + * Limit permissions to be no greater than + * group permissions + */ + if (type == ALLOW && zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) { + if (!(mode & S_IRGRP)) + access_mask &= ~ACE_READ_DATA; + if (!(mode & S_IWGRP)) + access_mask &= + ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + if (!(mode & S_IXGRP)) + access_mask &= ~ACE_EXECUTE; + access_mask &= + ~(ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS); + } } - } - - if (need_canonical_six) { - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; - zfs_acl_node_t *aclnode = - zfs_acl_node_alloc(abstract_size * 6); - - aclnode->z_size = abstract_size * 6; - aclnode->z_ace_count = 6; - aclp->z_acl_bytes += aclnode->z_size; - list_insert_tail(&aclp->z_acl, aclnode); - - zacep = aclnode->z_acldata; - - i = 0; - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - 0, DENY, -1, ACE_OWNER); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, - DENY, -1, OWNING_GROUP); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, - ALLOW, -1, OWNING_GROUP); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); - aclp->z_acl_count += 6; - } - - zfs_acl_fixup_canonical_six(aclp, mode); + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops.ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); } -int +void zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { - int error; - - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); - *aclp = NULL; - error = zfs_acl_node_read(zp, aclp, B_TRUE); - if (error == 0) { - (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); - } - mutex_exit(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp); mutex_exit(&zp->z_lock); - return (error); + mutex_exit(&zp->z_acl_lock); + ASSERT(*aclp); } /* @@ -1691,8 +1554,8 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, uint64_t mode, boolean_t *need_chmod) { void *pacep; - void *acep, *acep2; - zfs_acl_node_t *aclnode, *aclnode2; + void *acep; + zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; uint64_t who; uint32_t access_mask; @@ -1714,7 +1577,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, *need_chmod = B_TRUE; pacep = NULL; aclp = zfs_acl_alloc(paclp->z_version); - if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD) + if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || vtype 
== VLNK) return (aclp); while (pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type)) { @@ -1743,11 +1606,11 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, OWNING_GROUP)) && (vreg || (vdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) { *need_chmod = B_FALSE; + } - if (!vdir && passthrough_x && - ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { - access_mask &= ~ACE_EXECUTE; - } + if (!vdir && passthrough_x && + ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; } aclnode = zfs_acl_node_alloc(ace_size); @@ -1765,6 +1628,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, &data2)) == data1sz); bcopy(data1, data2, data2sz); } + aclp->z_acl_count++; aclnode->z_ace_count++; aclp->z_acl_bytes += aclnode->z_size; @@ -1783,38 +1647,17 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, ASSERT(vdir); - newflags = aclp->z_ops.ace_flags_get(acep); + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ if ((iflags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) != - ACE_FILE_INHERIT_ACE) { - aclnode2 = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode2); - acep2 = aclnode2->z_acldata; - zfs_set_ace(aclp, acep2, - access_mask, type, who, - iflags|ACE_INHERITED_ACE); + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, newflags); - newflags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep2, + aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = aclp->z_ops.ace_data(acep, - &data1)) != 0) { - VERIFY((data2sz = - aclp->z_ops.ace_data(acep2, - &data2)) == data1sz); - bcopy(data1, data2, data1sz); - } - aclp->z_acl_count++; - aclnode2->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - zfs_restricted_update(zfsvfs, aclp, acep2); } else { - newflags |= ACE_INHERIT_ONLY_ACE; + newflags &= ~ACE_INHERIT_ONLY_ACE; aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } @@ -1835,6 +1678,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, zfs_acl_t *paclp; gid_t gid; boolean_t need_chmod = B_TRUE; + boolean_t inherited = B_FALSE; bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1843,7 +1687,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); - /* * Determine uid and gid. 
*/ @@ -1865,21 +1708,36 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (acl_ids->z_fgid != dzp->z_phys->zp_gid && + if (acl_ids->z_fgid != dzp->z_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { - if (dzp->z_phys->zp_mode & S_ISGID) { - acl_ids->z_fgid = dzp->z_phys->zp_gid; + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } } else { acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, ZFS_GROUP, cr, &acl_ids->z_fuidp); #ifdef __FreeBSD__ - gid = acl_ids->z_fgid = dzp->z_phys->zp_gid; + gid = acl_ids->z_fgid = dzp->z_gid; #else gid = crgetgid(cr); #endif @@ -1894,7 +1752,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, * file's new group, clear the file's set-GID bit. */ - if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (vap->va_type == VDIR)) { acl_ids->z_mode |= S_ISGID; } else { @@ -1904,28 +1762,38 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, } if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && - (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(dzp->z_phys->zp_flags & ZFS_XATTR)) { - mutex_enter(&dzp->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); - mutex_exit(&dzp->z_acl_lock); + (dzp->z_pflags & ZFS_INHERIT_ACE)) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); if (need_chmod) { - acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? + acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ? 
ZFS_ACL_AUTO_INHERIT : 0; - zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, - acl_ids->z_mode, acl_ids->z_aclp); + zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp); } } + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + return (0); } @@ -1946,8 +1814,8 @@ zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) boolean_t zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) { - return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || - zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); + return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* @@ -1965,15 +1833,15 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); - if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) - return (error); - if (mask == 0) return (ENOSYS); + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) + return (error); + mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); @@ -1982,8 +1850,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) /* * Scan ACL to determine number of ACEs */ - if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && - !(mask & VSA_ACE_ALLTYPES)) { + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; @@ -2004,7 +1871,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } vsecp->vsa_aclcnt = count; } else - count = aclp->z_acl_count; + count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; @@ -2038,11 +1905,11 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; - if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) + if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) + if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } @@ -2120,11 +1987,12 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; + uint64_t acl_obj; if (mask == 0) return (ENOSYS); - if (zp->z_phys->zp_flags & ZFS_IMMUTABLE) + if (zp->z_pflags & ZFS_IMMUTABLE) return (EPERM); if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) @@ -2140,37 +2008,41 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) * existing flags. */ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL? 
*/ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - zp->z_phys->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); } else { - dmu_tx_hold_write(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, aclp->z_acl_bytes); + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); @@ -2188,20 +2060,20 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); done: - mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); return (error); } @@ -2226,15 +2098,15 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) */ if ((v4_mode & WRITE_MASK_DATA) && (((ZTOV(zp)->v_type != VDIR) && - (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || (ZTOV(zp)->v_type == VDIR && - (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { + (zp->z_pflags & ZFS_IMMUTABLE)))) { return (EPERM); } #ifdef sun if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + (zp->z_pflags & ZFS_NOUNLINK)) { return (EPERM); } #else @@ -2244,13 +2116,13 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) * handled in zfs_zaccess_delete(). 
*/ if ((v4_mode & ACE_DELETE) && - (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + (zp->z_pflags & ZFS_NOUNLINK)) { return (EPERM); } #endif if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { + (zp->z_pflags & ZFS_AV_QUARANTINED))) { return (EACCES); } @@ -2297,19 +2169,21 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; - uid_t fowner; uid_t gowner; + uid_t fowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } + ASSERT(zp->z_acl_cached); + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { uint32_t mask_matched; @@ -2409,18 +2283,10 @@ zfs_has_access(znode_t *zp, cred_t *cr) uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - uid_t owner; - - owner = zfs_fuid_map_id(zp->z_zfsvfs, - zp->z_phys->zp_uid, cr, ZFS_OWNER); + uid_t owner; - return ( - secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || - secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || - secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || - secpolicy_vnode_chown(ZTOV(zp), cr, owner) == 0 || - secpolicy_vnode_setdac(ZTOV(zp), cr, owner) == 0 || - secpolicy_vnode_remove(ZTOV(zp), cr) == 0); + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); } return (B_TRUE); } @@ -2478,38 +2344,33 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) boolean_t owner = B_FALSE; boolean_t groupmbr = B_FALSE; boolean_t is_attr; - uid_t fowner; - uid_t gowner; uid_t uid = crgetuid(cr); int error; - if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + if (zdp->z_pflags & ZFS_AV_QUARANTINED) return (EACCES); - is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + is_attr = ((zdp->z_pflags & ZFS_XATTR) && (ZTOV(zdp)->v_type == VDIR)); if (is_attr) goto slow; + mutex_enter(&zdp->z_acl_lock); - if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { mutex_exit(&zdp->z_acl_lock); return (0); } - if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || - FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } - fowner = (uid_t)zdp->z_phys->zp_uid; - gowner = (uid_t)zdp->z_phys->zp_gid; - - if (uid == fowner) { + if (uid == zdp->z_uid) { owner = B_TRUE; - if (zdp->z_phys->zp_mode & S_IXUSR) { + if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); } else { @@ -2517,9 +2378,9 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) goto slow; } } - if (groupmember(gowner, cr)) { + if (groupmember(zdp->z_gid, cr)) { groupmbr = B_TRUE; - if (zdp->z_phys->zp_mode & S_IXGRP) { + if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); } else { @@ -2528,7 +2389,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) } } if (!owner && !groupmbr) { - if (zdp->z_phys->zp_mode & S_IXOTH) { + if (zdp->z_mode & S_IXOTH) { mutex_exit(&zdp->z_acl_lock); return (0); } @@ -2545,8 +2406,9 @@ slow: } /* - * Determine whether Access should be granted/denied, invoking least - * priv subsytem when a deny is determined. + * Determine whether Access should be granted/denied. 
+ * The least priv subsytem is always consulted as a basic privilege + * can define any form of access. */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) @@ -2554,13 +2416,13 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) uint32_t working_mode; int error; int is_attr; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; boolean_t check_privs; znode_t *xzp; znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; - is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && - (ZTOV(zp)->v_type == VDIR)); + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); #ifdef __FreeBSD__ /* @@ -2568,15 +2430,22 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) * Note that not checking them is not just an optimization - without * this shortcut, EA operations may bogusly fail with EACCES. */ - if (zp->z_phys->zp_flags & ZFS_XATTR) + if (zp->z_pflags & ZFS_XATTR) return (0); #else /* * If attribute then validate against base file */ if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + if ((error = zfs_zget(zp->z_zfsvfs, - zp->z_phys->zp_parent, &xzp)) != 0) { + parent, &xzp)) != 0) { return (error); } @@ -2598,11 +2467,36 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } #endif + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr)) == 0) { if (is_attr) VN_RELE(ZTOV(xzp)); - return (0); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); } if (error && !check_privs) { @@ -2616,12 +2510,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } if (error && check_privs) { - uid_t owner; mode_t checkmode = 0; - owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, - ZFS_OWNER); - /* * First check for implicit owner permission on * read_acl/read_attributes @@ -2643,9 +2533,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) if (working_mode & ACE_EXECUTE) checkmode |= VEXEC; - if (checkmode) - error = secpolicy_vnode_access(cr, ZTOV(check_zp), - owner, checkmode); + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, + needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); @@ -2668,8 +2557,12 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) error = EACCES; } } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, 
needed_bits); } + if (is_attr) VN_RELE(ZTOV(xzp)); @@ -2699,15 +2592,15 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) static int zfs_delete_final_check(znode_t *zp, znode_t *dzp, - mode_t missing_perms, cred_t *cr) + mode_t available_perms, cred_t *cr) { int error; uid_t downer; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); - error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); + error = secpolicy_vnode_access2(cr, ZTOV(dzp), + downer, available_perms, VWRITE|VEXEC); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); @@ -2756,7 +2649,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; - mode_t missing_perms; + mode_t available_perms; boolean_t dzpcheck_privs = B_TRUE; boolean_t zpcheck_privs = B_TRUE; @@ -2774,7 +2667,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) * to determine what was found. */ - if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (EPERM); /* @@ -2817,23 +2710,20 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) * only need to see if we have write/execute on directory. */ - if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (zfs_sticky_remove_access(dzp, zp, cr)); + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); - if (!dzpcheck_privs) + if (dzp_error != 0 && !dzpcheck_privs) return (dzp_error); /* * Fourth row */ - missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0; - missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0; - - ASSERT(missing_perms); + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; - return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); } @@ -2844,7 +2734,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, int add_perm; int error; - if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + if (szp->z_pflags & ZFS_AV_QUARANTINED) return (EACCES); add_perm = (ZTOV(szp)->v_type == VDIR) ? diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c index cd36696..acf632b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -27,6 +27,7 @@ #include #include #include +#include #include void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index 48c3ebf..7372ee7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -110,17 +110,41 @@ snapentry_compare(const void *a, const void *b) return (0); } +#ifdef sun +vnodeops_t *zfsctl_ops_root; +vnodeops_t *zfsctl_ops_snapdir; +vnodeops_t *zfsctl_ops_snapshot; +vnodeops_t *zfsctl_ops_shares; +vnodeops_t *zfsctl_ops_shares_dir; + +static const fs_operation_def_t zfsctl_tops_root[]; +static const fs_operation_def_t zfsctl_tops_snapdir[]; +static const fs_operation_def_t zfsctl_tops_snapshot[]; +static const fs_operation_def_t zfsctl_tops_shares[]; +#else /* !sun */ static struct vop_vector zfsctl_ops_root; static struct vop_vector zfsctl_ops_snapdir; static struct vop_vector zfsctl_ops_snapshot; static struct vop_vector zfsctl_ops_shares; static struct vop_vector zfsctl_ops_shares_dir; +#endif /* !sun */ static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); +#ifdef sun +static gfs_opsvec_t zfsctl_opsvec[] = { + { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, + { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, + { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, + { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir }, + { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares }, + { NULL } +}; +#endif /* sun */ + /* * Root directory elements. We only have two entries * snapshot and shares. 
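As a minimal sketch of the access mapping the reworked zfs_zaccess() in the zfs_acl.c hunks above performs (not a literal excerpt of the patch; the helper name is illustrative, but the ACE_* request bits and the VREAD/VWRITE/VEXEC vnode bits are the ones used there), the translation handed to secpolicy_vnode_access2() looks roughly like this:

	/*
	 * Illustrative sketch: collapse an NFSv4 ACE request mask into the
	 * classic vnode access bits that the least-privilege checks expect.
	 */
	static mode_t
	ace_mask_to_vnode_bits(uint32_t working_mode)
	{
		mode_t needed_bits = 0;

		/* any read-class ACE bit requires VREAD */
		if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
		    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
			needed_bits |= VREAD;
		/* any write-class ACE bit requires VWRITE */
		if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
		    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
			needed_bits |= VWRITE;
		/* execute maps directly */
		if (working_mode & ACE_EXECUTE)
			needed_bits |= VEXEC;

		return (needed_bits);
	}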
@@ -144,11 +168,35 @@ static gfs_dirent_t zfsctl_root_entries[] = { void zfsctl_init(void) { +#ifdef sun + VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0); +#endif } void zfsctl_fini(void) { +#ifdef sun + /* + * Remove vfsctl vnode ops + */ + if (zfsctl_ops_root) + vn_freevnodeops(zfsctl_ops_root); + if (zfsctl_ops_snapdir) + vn_freevnodeops(zfsctl_ops_snapdir); + if (zfsctl_ops_snapshot) + vn_freevnodeops(zfsctl_ops_snapshot); + if (zfsctl_ops_shares) + vn_freevnodeops(zfsctl_ops_shares); + if (zfsctl_ops_shares_dir) + vn_freevnodeops(zfsctl_ops_shares_dir); + + zfsctl_ops_root = NULL; + zfsctl_ops_snapdir = NULL; + zfsctl_ops_snapshot = NULL; + zfsctl_ops_shares = NULL; + zfsctl_ops_shares_dir = NULL; +#endif /* sun */ } boolean_t @@ -191,6 +239,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) { vnode_t *vp, *rvp; zfsctl_node_t *zcp; + uint64_t crtime[2]; ASSERT(zfsvfs->z_ctldir == NULL); @@ -201,7 +250,9 @@ zfsctl_create(zfsvfs_t *zfsvfs) zcp->zc_id = ZFSCTL_INO_ROOT; VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); - ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof (crtime))); + ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime); VN_URELE(rvp); /* @@ -273,12 +324,12 @@ static int zfsctl_common_access(ap) struct vop_access_args /* { struct vnode *a_vp; - int a_accmode; + accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { - int mode = ap->a_accmode; + accmode_t accmode = ap->a_accmode; #ifdef TODO if (flags & V_ACE_MASK) { @@ -286,8 +337,8 @@ zfsctl_common_access(ap) return (EACCES); } else { #endif - if (mode & VWRITE) - return (EACCES); + if (accmode & VWRITE) + return (EACCES); #ifdef TODO } #endif @@ -301,14 +352,13 @@ zfsctl_common_access(ap) static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { - zfsctl_node_t *zcp = vp->v_data; timestruc_t now; vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = 0; /* - * We are a purly virtual object, so we have no + * We are a purely virtual object, so we have no * blocksize or allocated blocks. */ vap->va_blksize = 0; @@ -323,7 +373,6 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) */ gethrestime(&now); vap->va_atime = now; - vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime; /* FreeBSD: Reset chflags(2) flags. */ vap->va_flags = 0; } @@ -363,6 +412,7 @@ zfsctl_common_fid(ap) return (0); } + /*ARGSUSED*/ static int zfsctl_shares_fid(ap) @@ -436,16 +486,18 @@ zfsctl_root_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; - struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_node_t *zcp = vp->v_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; + vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; + vap->va_birthtime = vap->va_ctime; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); @@ -453,6 +505,40 @@ zfsctl_root_getattr(ap) return (0); } +/* + * Special case the handling of "..". 
+ */ +/* ARGSUSED */ +int +zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + + if (strcmp(nm, "..") == 0) { + err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp); + if (err == 0) + VOP_UNLOCK(*vpp, 0); + } else { + err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp); + } + + ZFS_EXIT(zfsvfs); + + return (err); +} + #ifdef sun static int zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, @@ -493,40 +579,6 @@ static const fs_operation_def_t zfsctl_tops_root[] = { */ /* ARGSUSED */ int -zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - int err; - - /* - * No extended attributes allowed under .zfs - */ - if (flags & LOOKUP_XATTR) - return (EINVAL); - - ZFS_ENTER(zfsvfs); - - if (strcmp(nm, "..") == 0) { - err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp); - if (err == 0) - VOP_UNLOCK(*vpp, 0); - } else { - err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, - cr, ct, direntflags, realpnp); - } - - ZFS_EXIT(zfsvfs); - - return (err); -} - -/* - * Special case the handling of "..". - */ -/* ARGSUSED */ -int zfsctl_freebsd_root_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; @@ -551,7 +603,6 @@ zfsctl_freebsd_root_lookup(ap) err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (err == 0 && (nm[0] != '.' || nm[1] != '\0')) vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - return (err); } @@ -566,6 +617,9 @@ static struct vop_vector zfsctl_ops_root = { .vop_lookup = zfsctl_freebsd_root_lookup, .vop_inactive = gfs_vop_inactive, .vop_reclaim = zfsctl_common_reclaim, +#ifdef TODO + .vop_pathconf = zfsctl_pathconf, +#endif .vop_fid = zfsctl_common_fid, }; @@ -596,10 +650,32 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) if ((error = vn_vfswlock(svp)) != 0) return (error); +#ifdef sun + VN_HOLD(svp); + error = dounmount(vn_mountedvfs(svp), fflags, cr); + if (error) { + VN_RELE(svp); + return (error); + } + + /* + * We can't use VN_RELE(), as that will try to invoke + * zfsctl_snapdir_inactive(), which would cause us to destroy + * the sd_lock mutex held by our caller. 
+ */ + ASSERT(svp->v_count == 1); + gfs_vop_inactive(svp, cr, NULL); + + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + + return (0); +#else /* !sun */ return (dounmount(vn_mountedvfs(svp), fflags, curthread)); +#endif /* !sun */ } -#if 0 +#ifdef sun static void zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) { @@ -639,7 +715,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); - vfs_setmntpoint(vfsp, newpath); + vfs_setmntpoint(vfsp, newpath, 0); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); @@ -648,13 +724,13 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); - vfs_setresource(vfsp, newpath); + vfs_setresource(vfsp, newpath, 0); vfs_unlock(vfsp); } -#endif +#endif /* sun */ -#if 0 +#ifdef sun /*ARGSUSED*/ static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, @@ -717,9 +793,9 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, return (err); } -#endif +#endif /* sun */ -#if 0 +#ifdef sun /* ARGSUSED */ static int zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, @@ -769,7 +845,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, if (avl_find(&sdp->sd_snaps, sep, &where) == NULL) avl_insert(&sdp->sd_snaps, sep, where); } else - err = dmu_objset_destroy(snapname); + err = dmu_objset_destroy(snapname, B_FALSE); } else { err = ENOENT; } @@ -778,7 +854,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, return (err); } -#endif +#endif /* sun */ /* * This creates a snapshot under '.zfs/snapshot'. @@ -806,7 +882,8 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, return (err); if (err == 0) { - err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); + err = dmu_objset_snapshot(name, dirname, NULL, NULL, + B_FALSE, B_FALSE, -1); if (err) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); @@ -951,8 +1028,7 @@ zfsctl_snapdir_lookup(ap) */ return (err == EILSEQ ? ENOENT : err); } - if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) { + if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { mutex_exit(&sdp->sd_lock); /* Translate errors and add SAVENAME when needed. 
*/ if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) { @@ -972,7 +1048,7 @@ zfsctl_snapdir_lookup(ap) VN_HOLD(*vpp); avl_insert(&sdp->sd_snaps, sep, where); - dmu_objset_close(snap); + dmu_objset_rele(snap, FTAG); domount: mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1; @@ -1194,6 +1270,8 @@ zfsctl_shares_getattr(ap) } ZFS_EXIT(zfsvfs); return (error); + + } /* ARGSUSED */ @@ -1203,11 +1281,10 @@ zfsctl_snapdir_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; - struct thread *a_td; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_snapdir_t *sdp = vp->v_data; @@ -1215,6 +1292,8 @@ zfsctl_snapdir_getattr(ap) zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_birthtime = vap->va_ctime; ZFS_EXIT(zfsvfs); return (0); @@ -1251,6 +1330,38 @@ zfsctl_snapdir_inactive(ap) return (0); } +#ifdef sun +static const fs_operation_def_t zfsctl_tops_snapdir[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, + { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, + { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, + { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, + { NULL } +}; + +static const fs_operation_def_t zfsctl_tops_shares[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, + { NULL } +}; +#else /* !sun */ static struct vop_vector zfsctl_ops_snapdir = { .vop_default = &default_vnodeops, .vop_open = zfsctl_common_open, @@ -1279,6 +1390,7 @@ static struct vop_vector zfsctl_ops_shares = { .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_shares_fid, }; +#endif /* !sun */ /* * pvp is the GFS vnode '.zfs/snapshot'. @@ -1347,8 +1459,8 @@ zfsctl_snapshot_inactive(ap) if (!locked) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); -end: +end: /* * Dispose of the vnode for the snapshot mount point. 
* This is safe to do because once this entry has been removed diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c new file mode 100644 index 0000000..d0f411a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT3U(zfs_dbgmsg_size, ==, 0); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + */ +void +zfs_dbgmsg(const char *fmt, ...) +{ + int size; + va_list adx; + zfs_dbgmsg_t *zdm; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. + */ + zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); + zdm->zdm_timestamp = gethrestime_sec(); + + va_start(adx, fmt); + (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); + va_end(adx); + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; + while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { + zdm = list_remove_head(&zfs_dbgmsgs); + size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_exit(&zfs_dbgmsgs_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c index 3ac4741..bae9071 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -52,6 +51,8 @@ #include #include #include +#include +#include #include #include @@ -286,8 +287,10 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, * See if there's an object by this name; if so, put a hold on it. */ if (flag & ZXATTR) { - zoid = dzp->z_phys->zp_xattr; - error = (zoid == 0 ? ENOENT : 0); + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? ENOENT : 0); } else { if (update) vp = dnlc_lookup(ZTOV(dzp), name); @@ -379,25 +382,29 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, zfs_dirlock_t *dl; znode_t *zp; int error = 0; + uint64_t parent; if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *vpp = ZTOV(dzp); VN_HOLD(*vpp); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ - if (dzp->z_phys->zp_parent == dzp->z_id && - zfsvfs->z_parent != zfsvfs) { + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, "snapshot", vpp, NULL, 0, NULL, kcred, NULL, NULL, NULL); return (error); } rw_enter(&dzp->z_parent_lock, RW_READER); - error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); + error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *vpp = ZTOV(zp); rw_exit(&dzp->z_parent_lock); @@ -445,7 +452,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); - ASSERT3U(zp->z_phys->zp_links, ==, 0); + ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); @@ -540,10 +547,12 @@ zfs_purgedir(znode_t *dzp) (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -576,15 +585,16 @@ zfs_rmnode(znode_t *zp) znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; + uint64_t xattr_obj; int error; - ASSERT(zp->z_phys->zp_links == 0); + ASSERT(zp->z_links == 0); /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && - (zp->z_phys->zp_flags & ZFS_XATTR)) { + (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. @@ -613,12 +623,14 @@ zfs_rmnode(znode_t *zp) * If the file has extended attributes, we're going to unlink * the xattr dir. */ - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT(error == 0); } - acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. 
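The recurring pattern in the zfs_dir.c hunks above replaces direct znode_phys_t dereferences (zp->z_phys->zp_xattr, zp->z_phys->zp_parent) with system-attribute lookups through the znode's SA handle. A minimal sketch of that pattern, assuming the sa_lookup() call and SA_ZPL_* macros exactly as used in those hunks (the helper name here is hypothetical):

	/*
	 * Illustrative sketch: fetch the parent object number via the SA
	 * layer instead of reading it from the (now removed) bonus buffer.
	 */
	static int
	zfs_get_parent_obj(znode_t *zp, uint64_t *parentp)
	{
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		/* previously: *parentp = zp->z_phys->zp_parent; */
		return (sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
		    parentp, sizeof (*parentp)));
	}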
@@ -627,11 +639,13 @@ zfs_rmnode(znode_t *zp) dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { - dmu_tx_hold_bonus(tx, xzp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* @@ -646,10 +660,12 @@ zfs_rmnode(znode_t *zp) } if (xzp) { - dmu_buf_will_dirty(xzp->z_dbuf, tx); + ASSERT(error == 0); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_phys->zp_links = 0; /* no more links to it */ + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } @@ -667,11 +683,12 @@ out: } static uint64_t -zfs_dirent(znode_t *zp) +zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT((zp)->z_phys->zp_mode) << 60; + de |= IFTODT(mode) << 60; return (de); } @@ -682,12 +699,15 @@ int zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) { znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; int error; - dmu_buf_will_dirty(zp->z_dbuf, tx); mutex_enter(&zp->z_lock); if (!(flag & ZRENAMING)) { @@ -696,22 +716,47 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) mutex_exit(&zp->z_lock); return (ENOENT); } - zp->z_phys->zp_links++; + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime, B_TRUE); } - zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); - if (!(flag & ZNEW)) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); mutex_exit(&zp->z_lock); - dmu_buf_will_dirty(dzp->z_dbuf, tx); mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size++; /* one dirent added */ - dzp->z_phys->zp_links += zp_is_dir; /* ".." 
link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); mutex_exit(&dzp->z_lock); - value = zfs_dirent(zp); + value = zfs_dirent(zp, zp->z_mode); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); ASSERT(error == 0); @@ -721,6 +766,30 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) return (0); } +static int +zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (zp->z_zfsvfs->z_norm) { + if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && + (flag & ZCIEXACT)) || + ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && + !(flag & ZCILOOK))) + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_EXACT, tx); + else + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_FIRST, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, tx); + } + + return (error); +} + /* * Unlink zp from dl, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). @@ -733,16 +802,18 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; int error; dnlc_remove(ZTOV(dzp), dl->dl_name); if (!(flag & ZRENAMING)) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ return (EBUSY); @@ -752,51 +823,74 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, } mutex_enter(&zp->z_lock); - if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ + + if (zp_is_dir && !zfs_dirempty(zp)) { mutex_exit(&zp->z_lock); vn_vfsunlock(vp); return (ENOTEMPTY); } - if (zp->z_phys->zp_links <= zp_is_dir) { + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. 
+ */ + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) { + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (error); + } + + if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on vnode %p is %u, " "should be at least %u", zp->z_vnode, - (int)zp->z_phys->zp_links, + (int)zp->z_links, zp_is_dir + 1); - zp->z_phys->zp_links = zp_is_dir + 1; + zp->z_links = zp_is_dir + 1; } - if (--zp->z_phys->zp_links == zp_is_dir) { + if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; - zp->z_phys->zp_links = 0; + zp->z_links = 0; unlinked = B_TRUE; } else { - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT(error == 0); mutex_exit(&zp->z_lock); vn_vfsunlock(vp); + } else { + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) + return (error); } - dmu_buf_will_dirty(dzp->z_dbuf, tx); mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size--; /* one dirent removed */ - dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); - mutex_exit(&dzp->z_lock); - - if (zp->z_zfsvfs->z_norm) { - if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && - (flag & ZCIEXACT)) || - ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && - !(flag & ZCILOOK))) - error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_EXACT, tx); - else - error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_FIRST, tx); - } else { - error = zap_remove(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, tx); - } + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." 
link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); + mutex_exit(&dzp->z_lock); if (unlinkedp != NULL) *unlinkedp = unlinked; @@ -814,7 +908,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t zfs_dirempty(znode_t *dzp) { - return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); + return (dzp->z_size == 2 && dzp->z_dirlocks == 0); } int @@ -826,6 +920,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + uint64_t parent; *xvpp = NULL; @@ -846,28 +941,39 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) return (EDQUOT); } +top: tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - zfs_acl_ids_free(&acl_ids); - if (error == ERESTART) + if (error == ERESTART) { dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - ASSERT(xzp->z_phys->zp_parent == zp->z_id); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xzp->z_id; +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); @@ -912,7 +1018,6 @@ top: return (0); } - ASSERT(zp->z_phys->zp_xattr == 0); if (!(flags & CREATE_XATTR_DIR)) { zfs_dirent_unlock(dl); @@ -980,11 +1085,11 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) if (zdp->z_zfsvfs->z_replay) return (0); - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + if ((zdp->z_mode & S_ISVTX) == 0) return (0); - downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c index 4b27ec3..0b48126 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c @@ -28,16 +28,12 @@ #include #include #include +#include #include #include 
#include - -#ifdef _KERNEL -/* Including sys/bus.h is just too hard, so I declare what I need here. */ -extern void devctl_notify(const char *__system, const char *__subsystem, - const char *__type, const char *__data); -#endif +#include /* * This general routine is responsible for generating all the different ZFS @@ -92,21 +88,32 @@ extern void devctl_notify(const char *__system, const char *__subsystem, * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). + * + * For checksum errors, we want to include more information about the actual + * error which occurs. Accordingly, we build an ereport when the error is + * noticed, but instead of sending it in immediately, we hang it off of the + * io_cksum_report field of the logical IO. When the logical IO completes + * (successfully or not), zfs_ereport_finish_checksum() is called with the + * good and bad versions of the buffer (if available), and we annotate the + * ereport with information about the differences. */ -void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, +#ifdef _KERNEL +static void +zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, + const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t size) { -#ifdef _KERNEL - char buf[1024]; - struct sbuf sb; - struct timespec ts; - int error; + nvlist_t *ereport, *detector; + + uint64_t ena; + char class[64]; /* - * If we are doing a spa_tryimport(), ignore errors. + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) return; /* @@ -114,7 +121,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * failed, don't bother logging any new ereports - we're just going to * get the same diagnosis anyway. */ - if (spa->spa_load_state != SPA_LOAD_NONE && + if (spa_load_state(spa) != SPA_LOAD_NONE && spa->spa_last_open_failed) return; @@ -153,9 +160,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * not yet been asynchronously placed into the REMOVED * state. */ - if (zio->io_vd == vd && - !vdev_accessible(vd, zio) && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) return; /* @@ -169,51 +174,57 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, return; } } - nanotime(&ts); - sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); - sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return; + + if ((ereport = fm_nvlist_create(NULL)) == NULL) + return; + + if ((detector = fm_nvlist_create(NULL)) == NULL) { + fm_nvlist_destroy(ereport, FM_NVA_FREE); + return; + } /* * Serialize ereport generation */ mutex_enter(&spa->spa_errlist_lock); -#if 0 /* * Determine the ENA to use for this event. If we are in a loading * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. 
*/ - if (spa->spa_load_state != SPA_LOAD_NONE) { -#if 0 + if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); -#endif ena = spa->spa_ena; } else if (zio != NULL && zio->io_logical != NULL) { -#if 0 if (zio->io_logical->io_ena == 0) zio->io_logical->io_ena = fm_ena_generate(0, FM_ENA_FMT1); -#endif ena = zio->io_logical->io_ena; } else { -#if 0 ena = fm_ena_generate(0, FM_ENA_FMT1); -#else - ena = 0; -#endif } -#endif /* * Construct the full class, detector, and other standard FMA fields. */ - sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION); - sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass); + (void) snprintf(class, sizeof (class), "%s.%s", + ZFS_ERROR_CLASS, subclass); - sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION); + fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), + vd != NULL ? vd->vdev_guid : 0); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); /* * Construct the per-ereport payload, depending on which parameters are @@ -223,51 +234,57 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, /* * Generic payload members common to all ereports. */ - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)); - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, - spa_guid(spa)); - sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, - spa->spa_load_state); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, + DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + DATA_TYPE_UINT64, spa_guid(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, + spa_load_state(spa), NULL); if (spa != NULL) { - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + DATA_TYPE_STRING, spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? FM_EREPORT_FAILMODE_WAIT : spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
- FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC); + FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, + NULL); } if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, - vd->vdev_guid); - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, - vd->vdev_ops->vdev_op_type); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + DATA_TYPE_UINT64, vd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); if (vd->vdev_path != NULL) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path); + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + DATA_TYPE_STRING, vd->vdev_path, NULL); if (vd->vdev_devid != NULL) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid); + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, + DATA_TYPE_STRING, vd->vdev_devid, NULL); if (vd->vdev_fru != NULL) - sbuf_printf(&sb, " %s=%s", - FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru); + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, + DATA_TYPE_STRING, vd->vdev_fru, NULL); if (pvd != NULL) { - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid); - sbuf_printf(&sb, " %s=%s", + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, + DATA_TYPE_UINT64, pvd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, - pvd->vdev_ops->vdev_op_type); + DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, + NULL); if (pvd->vdev_path) - sbuf_printf(&sb, " %s=%s", + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, - pvd->vdev_path); + DATA_TYPE_STRING, pvd->vdev_path, NULL); if (pvd->vdev_devid) - sbuf_printf(&sb, " %s=%s", + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, - pvd->vdev_devid); + DATA_TYPE_STRING, pvd->vdev_devid, NULL); } } @@ -275,8 +292,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, /* * Payload common to all I/Os. */ - sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, - zio->io_error); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + DATA_TYPE_INT32, zio->io_error, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a @@ -284,52 +301,500 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * provided for us, instead of within the zio_t. */ if (vd != NULL) { - if (size) { - sbuf_printf(&sb, " %s=%ju", + if (size) + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - stateoroffset); - sbuf_printf(&sb, " %s=%ju", - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size); - } else { - sbuf_printf(&sb, " %s=%ju", + DATA_TYPE_UINT64, stateoroffset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, size, NULL); + else + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - zio->io_offset); - sbuf_printf(&sb, " %s=%ju", + DATA_TYPE_UINT64, zio->io_offset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, - zio->io_size); - } + DATA_TYPE_UINT64, zio->io_size, NULL); } /* * Payload for I/Os with corresponding logical information. 
*/ - if (zio->io_logical != NULL) { - sbuf_printf(&sb, " %s=%ju", + if (zio->io_logical != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_objset, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, - zio->io_logical->io_bookmark.zb_object); - sbuf_printf(&sb, " %s=%ju", + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_object, FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, - zio->io_logical->io_bookmark.zb_level); - sbuf_printf(&sb, " %s=%ju", + DATA_TYPE_INT64, + zio->io_logical->io_bookmark.zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, - zio->io_logical->io_bookmark.zb_blkid); - } + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_blkid, NULL); } else if (vd != NULL) { /* * If we have a vdev but no zio, this is a device fault, and the * 'stateoroffset' parameter indicates the previous state of the * vdev. */ - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, - stateoroffset); + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + DATA_TYPE_UINT64, stateoroffset, NULL); } + mutex_exit(&spa->spa_errlist_lock); - error = sbuf_finish(&sb); - devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb)); - if (error != 0) - printf("ZFS WARNING: sbuf overflowed\n"); - sbuf_delete(&sb); + *ereport_out = ereport; + *detector_out = detector; +} + +/* if it's <= 128 bytes, save the corruption directly */ +#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) + +#define MAX_RANGES 16 + +typedef struct zfs_ecksum_info { + /* histograms of set and cleared bits by bit number in a 64-bit word */ + uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY]; + uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; + + /* inline arrays of bits set and cleared. */ + uint64_t zei_bits_set[ZFM_MAX_INLINE]; + uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; + + /* + * for each range, the number of bits set and cleared. The Hamming + * distance between the good and bad buffers is the sum of them all. + */ + uint32_t zei_range_sets[MAX_RANGES]; + uint32_t zei_range_clears[MAX_RANGES]; + + struct zei_ranges { + uint32_t zr_start; + uint32_t zr_end; + } zei_ranges[MAX_RANGES]; + + size_t zei_range_count; + uint32_t zei_mingap; + uint32_t zei_allowed_mingap; + +} zfs_ecksum_info_t; + +static void +update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count) +{ + size_t i; + size_t bits = 0; + uint64_t value = BE_64(value_arg); + + /* We store the bits in big-endian (largest-first) order */ + for (i = 0; i < 64; i++) { + if (value & (1ull << i)) { + hist[63 - i]++; + ++bits; + } + } + /* update the count of bits changed */ + *count += bits; +} + +/* + * We've now filled up the range array, and need to increase "mingap" and + * shrink the range list accordingly. zei_mingap is always the smallest + * distance between array entries, so we set the new_allowed_gap to be + * one greater than that. We then go through the list, joining together + * any ranges which are closer than the new_allowed_gap. + * + * By construction, there will be at least one. We also update zei_mingap + * to the new smallest gap, to prepare for our next invocation. 
+ */ +static void +shrink_ranges(zfs_ecksum_info_t *eip) +{ + uint32_t mingap = UINT32_MAX; + uint32_t new_allowed_gap = eip->zei_mingap + 1; + + size_t idx, output; + size_t max = eip->zei_range_count; + + struct zei_ranges *r = eip->zei_ranges; + + ASSERT3U(eip->zei_range_count, >, 0); + ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); + + output = idx = 0; + while (idx < max - 1) { + uint32_t start = r[idx].zr_start; + uint32_t end = r[idx].zr_end; + + while (idx < max - 1) { + idx++; + + uint32_t nstart = r[idx].zr_start; + uint32_t nend = r[idx].zr_end; + + uint32_t gap = nstart - end; + if (gap < new_allowed_gap) { + end = nend; + continue; + } + if (gap < mingap) + mingap = gap; + break; + } + r[output].zr_start = start; + r[output].zr_end = end; + output++; + } + ASSERT3U(output, <, eip->zei_range_count); + eip->zei_range_count = output; + eip->zei_mingap = mingap; + eip->zei_allowed_mingap = new_allowed_gap; +} + +static void +add_range(zfs_ecksum_info_t *eip, int start, int end) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + + if (count >= MAX_RANGES) { + shrink_ranges(eip); + count = eip->zei_range_count; + } + if (count == 0) { + eip->zei_mingap = UINT32_MAX; + eip->zei_allowed_mingap = 1; + } else { + int gap = start - r[count - 1].zr_end; + + if (gap < eip->zei_allowed_mingap) { + r[count - 1].zr_end = end; + return; + } + if (gap < eip->zei_mingap) + eip->zei_mingap = gap; + } + r[count].zr_start = start; + r[count].zr_end = end; + eip->zei_range_count++; +} + +static size_t +range_total_size(zfs_ecksum_info_t *eip) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + size_t result = 0; + size_t idx; + + for (idx = 0; idx < count; idx++) + result += (r[idx].zr_end - r[idx].zr_start); + + return (result); +} + +static zfs_ecksum_info_t * +annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, + const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, + boolean_t drop_if_identical) +{ + const uint64_t *good = (const uint64_t *)goodbuf; + const uint64_t *bad = (const uint64_t *)badbuf; + + uint64_t allset = 0; + uint64_t allcleared = 0; + + size_t nui64s = size / sizeof (uint64_t); + + size_t inline_size; + int no_inline = 0; + size_t idx; + size_t range; + + size_t offset = 0; + ssize_t start = -1; + + zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); + + /* don't do any annotation for injected checksum errors */ + if (info != NULL && info->zbc_injected) + return (eip); + + if (info != NULL && info->zbc_has_cksum) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_expected) / sizeof (uint64_t), + (uint64_t *)&info->zbc_expected, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_actual) / sizeof (uint64_t), + (uint64_t *)&info->zbc_actual, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, + DATA_TYPE_STRING, + info->zbc_checksum_name, + NULL); + + if (info->zbc_byteswapped) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, + DATA_TYPE_BOOLEAN, 1, + NULL); + } + } + + if (badbuf == NULL || goodbuf == NULL) + return (eip); + + ASSERT3U(nui64s, <=, UINT16_MAX); + ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, <=, UINT32_MAX); + + /* build up the range list by comparing the two buffers. 
*/ + for (idx = 0; idx < nui64s; idx++) { + if (good[idx] == bad[idx]) { + if (start == -1) + continue; + + add_range(eip, start, idx); + start = -1; + } else { + if (start != -1) + continue; + + start = idx; + } + } + if (start != -1) + add_range(eip, start, idx); + + /* See if it will fit in our inline buffers */ + inline_size = range_total_size(eip); + if (inline_size > ZFM_MAX_INLINE) + no_inline = 1; + + /* + * If there is no change and we want to drop if the buffers are + * identical, do so. + */ + if (inline_size == 0 && drop_if_identical) { + kmem_free(eip, sizeof (*eip)); + return (NULL); + } + + /* + * Now walk through the ranges, filling in the details of the + * differences. Also convert our uint64_t-array offsets to byte + * offsets. + */ + for (range = 0; range < eip->zei_range_count; range++) { + size_t start = eip->zei_ranges[range].zr_start; + size_t end = eip->zei_ranges[range].zr_end; + + for (idx = start; idx < end; idx++) { + uint64_t set, cleared; + + // bits set in bad, but not in good + set = ((~good[idx]) & bad[idx]); + // bits set in good, but not in bad + cleared = (good[idx] & (~bad[idx])); + + allset |= set; + allcleared |= cleared; + + if (!no_inline) { + ASSERT3U(offset, <, inline_size); + eip->zei_bits_set[offset] = set; + eip->zei_bits_cleared[offset] = cleared; + offset++; + } + + update_histogram(set, eip->zei_histogram_set, + &eip->zei_range_sets[range]); + update_histogram(cleared, eip->zei_histogram_cleared, + &eip->zei_range_clears[range]); + } + + /* convert to byte offsets */ + eip->zei_ranges[range].zr_start *= sizeof (uint64_t); + eip->zei_ranges[range].zr_end *= sizeof (uint64_t); + } + eip->zei_allowed_mingap *= sizeof (uint64_t); + inline_size *= sizeof (uint64_t); + + /* fill in ereport */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, + DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, + (uint32_t *)eip->zei_ranges, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, + DATA_TYPE_UINT32, eip->zei_allowed_mingap, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, + NULL); + + if (!no_inline) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_cleared, + NULL); + } else { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, + DATA_TYPE_UINT16_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, + DATA_TYPE_UINT16_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, + NULL); + } + return (eip); +} +#endif + +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + + zfs_ereport_start(&ereport, &detector, + subclass, spa, vd, zio, stateoroffset, size); + + if (ereport == NULL) + return; + + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); +#endif +} + +void +zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, void *arg, + zio_bad_cksum_t *info) +{ + zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); + + if 
(zio->io_vsd != NULL) + zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); + else + zio_vsd_default_cksum_report(zio, report, arg); + + /* copy the checksum failure information if it was provided */ + if (info != NULL) { + report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); + bcopy(info, report->zcr_ckinfo, sizeof (*info)); + } + + report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_length = length; + +#ifdef _KERNEL + zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (report->zcr_ereport == NULL) { + report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); + kmem_free(report, sizeof (*report)); + return; + } +#endif + + mutex_enter(&spa->spa_errlist_lock); + report->zcr_next = zio->io_logical->io_cksum_report; + zio->io_logical->io_cksum_report = report; + mutex_exit(&spa->spa_errlist_lock); +} + +void +zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical) +{ +#ifdef _KERNEL + zfs_ecksum_info_t *info = NULL; + info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, + good_data, bad_data, report->zcr_length, drop_if_identical); + + if (info != NULL) + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); + + fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); + fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); + report->zcr_ereport = report->zcr_detector = NULL; + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + +void +zfs_ereport_free_checksum(zio_cksum_report_t *rpt) +{ +#ifdef _KERNEL + if (rpt->zcr_ereport != NULL) { + fm_nvlist_destroy(rpt->zcr_ereport, + FM_NVA_FREE); + fm_nvlist_destroy(rpt->zcr_detector, + FM_NVA_FREE); + } +#endif + rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); + + if (rpt->zcr_ckinfo != NULL) + kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); + + kmem_free(rpt, sizeof (*rpt)); +} + +void +zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) +{ +#ifdef _KERNEL + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); +#endif +} + +void +zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + zfs_ecksum_info_t *info; + + zfs_ereport_start(&ereport, &detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (ereport == NULL) + return; + + info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, + B_FALSE); + + if (info != NULL) + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); + + if (info != NULL) + kmem_free(info, sizeof (*info)); #endif } @@ -337,32 +802,28 @@ static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL - char buf[1024]; + nvlist_t *resource; char class[64]; - struct sbuf sb; - struct timespec ts; - int error; - nanotime(&ts); + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return; - sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); - sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); + if ((resource = fm_nvlist_create(NULL)) == NULL) + return; - snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE, + (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, ZFS_ERROR_CLASS, name); - sbuf_printf(&sb, " %s=%d", FM_VERSION, FM_RSRC_VERSION); - sbuf_printf(&sb, " %s=%s", 
FM_CLASS, class); - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, - spa_guid(spa)); + VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); + VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); if (vd) - sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, - vd->vdev_guid); - error = sbuf_finish(&sb); - ZFS_LOG(1, "%s", sbuf_data(&sb)); - devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); - if (error != 0) - printf("ZFS WARNING: sbuf overflowed\n"); - sbuf_delete(&sb); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); + + fm_ereport_post(resource, EVCH_SLEEP); + + fm_nvlist_destroy(resource, FM_NVA_FREE); #endif } @@ -388,3 +849,15 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); } + +/* + * The 'resource.fs.zfs.statechange' event is an internal signal that the + * given vdev has transitioned its state to DEGRADED or HEALTHY. This will + * cause the retire agent to repair any outstanding fault management cases + * open because the device was not found (fault.fs.zfs.device). + */ +void +zfs_post_state_change(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c index 8090ec1..5b54448 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include -#include #include #include #include @@ -377,7 +375,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) rw_enter(&zfsvfs->z_fuid_lock, RW_READER); - if (zfsvfs->z_fuid_obj) + if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); else domain = nulldomain; @@ -390,10 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) void zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) { - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, - cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, - cr, ZFS_GROUP); + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); } uid_t @@ -418,9 +414,9 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, (void) kidmap_getgidbysid(crgetzone(cr), domain, FUID_RID(fuid), &id); } -#else /* sun */ +#else /* !sun */ id = UID_NOBODY; -#endif /* sun */ +#endif /* !sun */ return (id); } @@ -431,7 +427,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, * If ACL has multiple domains, then keep only one copy of each unique * domain. */ -static void +void zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, uint64_t idx, uint64_t id, zfs_fuid_type_t type) { @@ -492,6 +488,11 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, /* * Create a file system FUID, based on information in the users cred + * + * If cred contains KSID_OWNER then it should be used to determine + * the uid otherwise cred's uid will be used. 
By default cred's gid + * is used unless it's an ephemeral ID in which case KSID_GROUP will + * be used if it exists. */ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, @@ -506,24 +507,31 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); - if (type == ZFS_OWNER) - id = crgetuid(cr); - else - id = crgetgid(cr); + ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); + + if (!zfsvfs->z_use_fuids || (ksid == NULL)) { + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); - if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id)) return ((uint64_t)id); + } -#ifdef sun - ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); + /* + * ksid is present and FUID is supported + */ + id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); + + if (!IS_EPHEMERAL(id)) + return ((uint64_t)id); + + if (type == ZFS_GROUP) + id = ksid_getid(ksid); - VERIFY(ksid != NULL); rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); -#else /* sun */ - rid = UID_NOBODY; - domain = nulldomain; -#endif /* sun */ + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); @@ -597,7 +605,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, }; domain = fuidp->z_domain_table[idx -1]; } else { -#ifdef sun if (type == ZFS_OWNER || type == ZFS_ACE_USER) status = kidmap_getsidbyuid(crgetzone(cr), id, &domain, &rid); @@ -606,7 +613,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, &domain, &rid); if (status != 0) { -#endif /* sun */ /* * When returning nobody we will need to * make a dummy fuid table entry for logging @@ -614,9 +620,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, */ rid = UID_NOBODY; domain = nulldomain; -#ifdef sun } -#endif /* sun */ } idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); @@ -699,18 +703,16 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) #ifdef sun ksid_t *ksid = crgetsid(cr, KSID_GROUP); ksidlist_t *ksidlist = crgetsidlist(cr); -#endif /* sun */ +#endif /* !sun */ uid_t gid; #ifdef sun if (ksid && ksidlist) { int i; ksid_t *ksid_groups; - ksidlist_t *ksidlist = crgetsidlist(cr); uint32_t idx = FUID_INDEX(id); uint32_t rid = FUID_RID(id); - ASSERT(ksidlist); ksid_groups = ksidlist->ksl_sids; for (i = 0; i != ksidlist->ksl_nsid; i++) { @@ -736,7 +738,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) } } } -#endif /* sun */ +#endif /* !sun */ /* * Not found in ksidlist, check posix groups diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 9a68adf..52300ee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -47,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -65,14 +63,18 @@ #include #include #include +#include #include +#include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" +#include "zfs_comutil.h" +#include "zfs_ioctl_compat.h" -CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE); +CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX); static struct cdev *zfsdev; @@ -105,17 +107,22 @@ static const char *userquota_perms[] = { }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); -static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops); +static int zfs_check_settable(const char *name, nvpair_t *property, + cred_t *cr); +static int zfs_check_clearable(char *dataset, nvlist_t *props, + nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); -int zfs_set_prop_nvlist(const char *, nvlist_t *); +int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); + +static void zfsdev_close(void *data); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; - char buf[256]; + char buf[512]; va_list adx; /* @@ -178,22 +185,15 @@ history_str_get(zfs_cmd_t *zc) static boolean_t zfs_is_bootfs(const char *name) { - spa_t *spa; - boolean_t ret = B_FALSE; - - if (spa_open(name, &spa, FTAG) == 0) { - if (spa->spa_bootfs) { - objset_t *os; + objset_t *os; - if (dmu_objset_open(name, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - ret = (dmu_objset_id(os) == spa->spa_bootfs); - dmu_objset_close(os); - } - } - spa_close(spa, FTAG); + if (dmu_objset_hold(name, FTAG, &os) == 0) { + boolean_t ret; + ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); + dmu_objset_rele(os, FTAG); + return (ret); } - return (ret); + return (B_FALSE); } /* @@ -227,13 +227,17 @@ zpl_earlier_version(const char *name, int version) objset_t *os; boolean_t rc = B_TRUE; - if (dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (B_TRUE); + } + /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } return (rc); } @@ -282,9 +286,8 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) } static int -zfs_dozonecheck(const char *dataset, cred_t *cr) +zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { - uint64_t zoned; int writable = 1; /* @@ -295,9 +298,6 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) !zone_dataset_visible(dataset, &writable)) return (ENOENT); - if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) - return (ENOENT); - if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the @@ -319,6 +319,32 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) return (0); } +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + + if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) + return (ENOENT); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) +{ + uint64_t zoned; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_prop_get_ds(ds, "jailed", 8, 1, 
&zoned, NULL)) { + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + return (ENOENT); + } + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { @@ -333,9 +359,126 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) return (error); } +int +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck_ds(name, ds, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error) + error = dsl_deleg_access_impl(ds, perm, cr); + } + return (error); +} + +#ifdef SECLABEL +/* + * Policy for setting the security label property. + * + * Returns 0 for success, non-zero for access and other errors. + */ +static int +zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) +{ + char ds_hexsl[MAXNAMELEN]; + bslabel_t ds_sl, new_sl; + boolean_t new_default = FALSE; + uint64_t zoned; + int needed_priv = -1; + int error; + + /* First get the existing dataset label. */ + error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EPERM); + + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + new_default = TRUE; + + /* The label must be translatable */ + if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) + return (EINVAL); + + /* + * In a non-global zone, disallow attempts to set a label that + * doesn't match that of the zone; otherwise no other checks + * are needed. + */ + if (!INGLOBALZONE(curproc)) { + if (new_default || !blequal(&new_sl, CR_SL(CRED()))) + return (EPERM); + return (0); + } + + /* + * For global-zone datasets (i.e., those whose zoned property is + * "off", verify that the specified new label is valid for the + * global zone. + */ + if (dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EPERM); + if (!zoned) { + if (zfs_check_global_label(name, strval) != 0) + return (EPERM); + } + + /* + * If the existing dataset label is nondefault, check if the + * dataset is mounted (label cannot be changed while mounted). + * Get the zfsvfs; if there isn't one, then the dataset isn't + * mounted (or isn't a dataset, doesn't exist, ...). + */ + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { + objset_t *os; + static char *setsl_tag = "setsl_tag"; + + /* + * Try to own the dataset; abort if there is any error, + * (e.g., already mounted, in use, or other error). + */ + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, + setsl_tag, &os); + if (error) + return (EPERM); + + dmu_objset_disown(os, setsl_tag); + + if (new_default) { + needed_priv = PRIV_FILE_DOWNGRADE_SL; + goto out_check; + } + + if (hexstr_to_label(strval, &new_sl) != 0) + return (EPERM); + + if (blstrictdom(&ds_sl, &new_sl)) + needed_priv = PRIV_FILE_DOWNGRADE_SL; + else if (blstrictdom(&new_sl, &ds_sl)) + needed_priv = PRIV_FILE_UPGRADE_SL; + } else { + /* dataset currently has a default label */ + if (!new_default) + needed_priv = PRIV_FILE_UPGRADE_SL; + } + +out_check: + if (needed_priv != -1) + return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); + return (0); +} +#endif /* SECLABEL */ + static int -zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, + cred_t *cr) { + char *strval; + /* * Check permissions for special properties. 
*/ @@ -357,16 +500,33 @@ zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) * quota on things *under* (ie. contained by) * the thing they own. */ - if (dsl_prop_get_integer(name, "jailed", &zoned, + if (dsl_prop_get_integer(dsname, "jailed", &zoned, setpoint)) return (EPERM); - if (!zoned || strlen(name) <= strlen(setpoint)) + if (!zoned || strlen(dsname) <= strlen(setpoint)) return (EPERM); } break; + + case ZFS_PROP_MLSLABEL: +#ifdef SECLABEL + if (!is_system_labeled()) + return (EPERM); + + if (nvpair_value_string(propval, &strval) == 0) { + int err; + + err = zfs_set_slabel_policy(dsname, strval, CRED()); + if (err != 0) + return (err); + } +#else + return (EOPNOTSUPP); +#endif + break; } - return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); + return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } int @@ -388,20 +548,45 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) { - int error; - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_ROLLBACK, cr); - if (error == 0) - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr); - return (error); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr)); } int zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + spa_t *spa; + dsl_pool_t *dp; + dsl_dataset_t *ds; + char *cp; + int error; + + /* + * Generate the current snapshot name from the given objsetid, then + * use that name for the secpolicy/zone checks. + */ + cp = strchr(zc->zc_name, '@'); + if (cp == NULL) + return (EINVAL); + error = spa_open(zc->zc_name, &spa, FTAG); + if (error) + return (error); + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + if (error) + return (error); + + dsl_dataset_name(ds, zc->zc_name); + + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND, cr); + dsl_dataset_rele(ds, FTAG); + + return (error); } static int @@ -495,19 +680,34 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) } /* - * Must have sys_config privilege to check the iscsi permission + * Destroying snapshots with delegated permissions requires + * descendent mount and destroy permissions. + * Reassemble the full filesystem@snap name so dsl_deleg_access() + * can do the correct permission check. + * + * Since this routine is used when doing a recursive destroy of snapshots + * and destroying snapshots requires descendent permissions, a successfull + * check of the top level snapshot applies to snapshots of all descendent + * datasets as well. 
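Concretely, the ioctl for a recursive snapshot destroy only carries the filesystem name and the snapshot component separately, so the policy code glues them back into a single fs@snap name and runs one delegation check at the top level. A userland-flavored sketch of that reassembly, assuming snprintf() in place of the kernel's kmem_asprintf() and an arbitrary stand-in for the name limit:

#include <stdio.h>

#define	SKETCH_MAXNAMELEN	256

/* Build "fs@snap"; returns 0 on success, -1 if the name would not fit. */
static int
snap_fullname(const char *fs, const char *snap, char *buf, size_t buflen)
{
	if ((size_t)snprintf(buf, buflen, "%s@%s", fs, snap) >= buflen)
		return (-1);
	return (0);
}

int
main(void)
{
	char name[SKETCH_MAXNAMELEN];

	if (snap_fullname("tank/home", "today", name, sizeof (name)) == 0)
		(void) printf("single permission check on %s\n", name);
	return (0);
}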
*/ -/* ARGSUSED */ static int -zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) { - return (secpolicy_zfs(cr)); + int error; + char *dsname; + + dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + + error = zfs_secpolicy_destroy_perms(dsname, cr); + + strfree(dsname); + return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[MAXNAMELEN]; int error; if ((error = zfs_secpolicy_write_perms(from, @@ -542,7 +742,7 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[MAXNAMELEN]; objset_t *clone; int error; @@ -551,20 +751,19 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) if (error) return (error); - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &clone); + error = dmu_objset_hold(zc->zc_name, FTAG, &clone); if (error == 0) { dsl_dataset_t *pclone = NULL; dsl_dir_t *dd; - dd = clone->os->os_dsl_dataset->ds_dir; + dd = clone->os_dsl_dataset->ds_dir; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &pclone); rw_exit(&dd->dd_pool->dp_config_rwlock); if (error) { - dmu_objset_close(clone); + dmu_objset_rele(clone, FTAG); return (error); } @@ -572,7 +771,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(pclone, parentname); - dmu_objset_close(clone); + dmu_objset_rele(clone, FTAG); dsl_dataset_rele(pclone, FTAG); if (error == 0) error = zfs_secpolicy_write_perms(parentname, @@ -601,16 +800,8 @@ zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) - return (error); - - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr); - - return (error); + return (zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)); } static int @@ -623,8 +814,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) { - char parentname[MAXNAMELEN]; - int error; + char parentname[MAXNAMELEN]; + int error; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) @@ -673,19 +864,19 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) } /* - * Just like zfs_secpolicy_config, except that we will check for - * mount permission on the dataset for permission to create/remove - * the minor nodes. + * Policy for object to name lookups. 
*/ +/* ARGSUSED */ static int -zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) { - if (secpolicy_sys_config(cr, B_FALSE) != 0) { - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)); - } + int error; - return (0); + if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + return (0); + + error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); + return (error); } /* @@ -709,28 +900,11 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { - if (!zfs_prop_inheritable(prop)) - return (EINVAL); - return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + return (zfs_secpolicy_setprop(zc->zc_name, prop, + NULL, cr)); } } -/* - * Policy for dataset backup operations (sendbackup). - * Requires SYS_MOUNT privilege, and must be writable in the local zone. - */ -static int -zfs_secpolicy_operator(const char *dataset, cred_t *cr) -{ - int writable = 1; - - if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) - return (ENOENT); - if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr)) - return (EPERM); - return (0); -} - static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) { @@ -777,14 +951,56 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) { - return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr)); + return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, + NULL, cr)); +} + +static int +zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_HOLD, cr)); +} + +static int +zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RELEASE, cr)); +} + +/* + * Policy for allowing temporary snapshots to be taken or released + */ +static int +zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) +{ + /* + * A temporary snapshot is the same as a snapshot, + * hold, destroy and release all rolled into one. + * Delegated diff alone is sufficient that we allow this. + */ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr)) == 0) + return (0); + + error = zfs_secpolicy_snapshot(zc, cr); + if (!error) + error = zfs_secpolicy_hold(zc, cr); + if (!error) + error = zfs_secpolicy_release(zc, cr); + if (!error) + error = zfs_secpolicy_destroy(zc, cr); + return (error); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. 
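get_nvlist() below is the kernel half of a simple handoff protocol: userland packs a native-encoded nvlist and passes its address and length through zc_nvlist_src/zc_nvlist_src_size, and results come back the same way via put_nvlist() and zc_nvlist_dst. A minimal sketch of the userland side using libnvpair; the property chosen is arbitrary and error handling is trimmed to the essentials.

#include <libnvpair.h>
#include <stdlib.h>

int
main(void)
{
	nvlist_t *props;
	char *packed;
	size_t len;

	(void) nvlist_alloc(&props, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_string(props, "compression", "gzip");

	/* Pack with the same encoding the kernel expects to unpack. */
	(void) nvlist_size(props, &len, NV_ENCODE_NATIVE);
	if ((packed = malloc(len)) == NULL) {
		nvlist_free(props);
		return (1);
	}
	(void) nvlist_pack(props, &packed, &len, NV_ENCODE_NATIVE, 0);

	/*
	 * A real caller would now store (uintptr_t)packed in zc_nvlist_src
	 * and len in zc_nvlist_src_size before issuing the ioctl.
	 */

	free(packed);
	nvlist_free(props);
	return (0);
}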
*/ static int -get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) +get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; @@ -798,7 +1014,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) packed = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { kmem_free(packed, size); return (error); } @@ -815,11 +1032,46 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) } static int +fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) +{ + size_t size; + + VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + + if (size > zc->zc_nvlist_dst_size) { + nvpair_t *more_errors; + int n = 0; + + if (zc->zc_nvlist_dst_size < 1024) + return (ENOMEM); + + VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); + more_errors = nvlist_prev_nvpair(*errors, NULL); + + do { + nvpair_t *pair = nvlist_prev_nvpair(*errors, + more_errors); + VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); + n++; + VERIFY(nvlist_size(*errors, &size, + NV_ENCODE_NATIVE) == 0); + } while (size > zc->zc_nvlist_dst_size); + + VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); + VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); + ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + ASSERT(size <= zc->zc_nvlist_dst_size); + } + + return (0); +} + +static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; + int error = 0; size_t size; - int error; VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); @@ -837,8 +1089,9 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); - error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size); + if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size, zc->zc_iflags) != 0) + error = EFAULT; kmem_free(packed, size); } @@ -847,25 +1100,28 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) } static int -getzfsvfs(const char *dsname, zfsvfs_t **zvp) +getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; - error = dmu_objset_open(dsname, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os); + error = dmu_objset_hold(dsname, FTAG, &os); if (error) return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } - mutex_enter(&os->os->os_user_ptr_lock); - *zvp = dmu_objset_get_user(os); - if (*zvp) { - VFS_HOLD((*zvp)->z_vfs); + mutex_enter(&os->os_user_ptr_lock); + *zfvp = dmu_objset_get_user(os); + if (*zfvp) { + VFS_HOLD((*zfvp)->z_vfs); } else { error = ESRCH; } - mutex_exit(&os->os->os_user_ptr_lock); - dmu_objset_close(os); + mutex_exit(&os->os_user_ptr_lock); + dmu_objset_rele(os, FTAG); return (error); } @@ -874,22 +1130,22 @@ getzfsvfs(const char *dsname, zfsvfs_t **zvp) * case its z_vfs will be NULL, and it will be opened as the owner. */ static int -zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp) +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; - int mode = DS_MODE_OWNER | (readonly ? 
DS_MODE_READONLY : 0); - if (getzfsvfs(name, zvp) != 0) - error = zfsvfs_create(name, mode, zvp); + if (getzfsvfs(name, zfvp) != 0) + error = zfsvfs_create(name, zfvp); if (error == 0) { - rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag); - if ((*zvp)->z_unmounted) { + rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : + RW_READER, tag); + if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ - rrw_exit(&(*zvp)->z_teardown_lock, tag); + rrw_exit(&(*zfvp)->z_teardown_lock, tag); return (EBUSY); } } @@ -904,7 +1160,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) if (zfsvfs->z_vfs) { VFS_RELE(zfsvfs->z_vfs); } else { - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } } @@ -919,11 +1175,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) char *buf; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) + zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -962,8 +1219,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) /* * Set the remaining root properties */ - if (!error && - (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0) + if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, + ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); if (buf != NULL) @@ -984,22 +1241,25 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc) int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { - int error; nvlist_t *config, *props = NULL; uint64_t guid; + int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) != 0) + zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -1007,11 +1267,15 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = EINVAL; - else if (zc->zc_cookie) - error = spa_import_verbatim(zc->zc_name, config, - props); else - error = spa_import(zc->zc_name, config, props); + error = spa_import(zc->zc_name, config, props, zc->zc_cookie); + + if (zc->zc_nvlist_dst != 0) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + } nvlist_free(config); @@ -1030,6 +1294,8 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } @@ -1087,7 +1353,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &tryconfig)) != 0) + zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); @@ -1103,8 +1369,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_cookie scan func (pool_scan_func_t) + */ static int -zfs_ioc_pool_scrub(zfs_cmd_t *zc) +zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; @@ -1112,7 +1383,10 @@ 
zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_scrub(spa, zc->zc_cookie); + if (zc->zc_cookie == POOL_SCAN_NONE) + error = spa_scan_stop(spa); + else + error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); @@ -1175,9 +1449,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, - (char *)(uintptr_t)zc->zc_history, - zc->zc_history_len); + error = ddi_copyout(hist_buf, + (void *)(uintptr_t)zc->zc_history, + zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); @@ -1196,18 +1470,59 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_value name of object + */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { - objset_t *osp; + objset_t *os; + int error; + + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_rele(os, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_stat stats on object + * zc_value path to object + */ +static int +zfs_ioc_obj_to_stats(zfs_cmd_t *zc) +{ + objset_t *os; int error; - if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0) + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_close(osp); + dmu_objset_rele(os, FTAG); return (error); } @@ -1217,20 +1532,15 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; -#ifdef sun nvlist_t *config, **l2cache, **spares; uint_t nl2cache = 0, nspares = 0; -#else - nvlist_t *config; -#endif error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config); -#ifdef sun + zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); @@ -1247,11 +1557,11 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) * * l2cache and spare devices are ok to be added to a rootpool. */ - if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { + if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { + nvlist_free(config); spa_close(spa, FTAG); return (EDOM); } -#endif if (error == 0) { error = spa_vdev_add(spa, config); @@ -1261,6 +1571,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_nvlist_conf nvlist of devices to remove + * zc_cookie to stop the remove? 
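Stepping back to zfs_ioc_pool_scan() at the top of this hunk: the old scrub ioctl took a scrub type, while the new one takes a pool_scan_func_t in zc_cookie, and POOL_SCAN_NONE means "stop whatever scan is running". A sketch of that dispatch with local stand-ins for the enum (the real values live in sys/fs/zfs.h); this is roughly what "zpool scrub" and "zpool scrub -s" end up requesting.

#include <stdio.h>

typedef enum sketch_pool_scan_func {
	SKETCH_POOL_SCAN_NONE,		/* stop a scrub or resilver in progress */
	SKETCH_POOL_SCAN_SCRUB,		/* "zpool scrub <pool>" */
	SKETCH_POOL_SCAN_RESILVER	/* started internally by the SPA */
} sketch_pool_scan_func_t;

/* Mirrors the zc_cookie dispatch in zfs_ioc_pool_scan(). */
static const char *
scan_request(sketch_pool_scan_func_t func)
{
	return (func == SKETCH_POOL_SCAN_NONE ?
	    "spa_scan_stop()" : "spa_scan()");
}

int
main(void)
{
	(void) printf("zpool scrub    -> %s\n",
	    scan_request(SKETCH_POOL_SCAN_SCRUB));
	(void) printf("zpool scrub -s -> %s\n",
	    scan_request(SKETCH_POOL_SCAN_NONE));
	return (0);
}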
+ */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { @@ -1294,11 +1610,19 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) break; case VDEV_STATE_FAULTED: - error = vdev_fault(spa, zc->zc_guid); + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: - error = vdev_degrade(spa, zc->zc_guid); + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: @@ -1321,7 +1645,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) == 0) { + zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -1346,15 +1670,50 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc) } static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_value; - uint64_t guid = zc->zc_guid; + nvlist_t *config, *props = NULL; int error; + boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) { + spa_close(spa, FTAG); + return (error); + } + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + spa_close(spa, FTAG); + nvlist_free(config); + return (error); + } + + error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); + + spa_close(spa, FTAG); + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); @@ -1379,6 +1738,35 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +{ + int error = 0; + nvlist_t *nv; + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_all(os, &nv)) == 0) { + dmu_objset_stats(os, nv); + /* + * NB: zvol_get_stats() will read the objset contents, + * which we aren't supposed to do with a + * DS_MODE_USER hold, because it could be + * inconsistent. So this is a bit of a workaround... 
+ * XXX reading with out owning + */ + if (!zc->zc_objset_stats.dds_inconsistent) { + if (dmu_objset_type(os) == DMU_OST_ZVOL) + VERIFY(zvol_get_stats(os, nv) == 0); + } + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + return (error); +} + /* * inputs: * zc_name name of filesystem @@ -1394,34 +1782,59 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os = NULL; int error; + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + + error = zfs_ioc_objset_stats_impl(zc, os); + + dmu_objset_rele(os, FTAG); + + if (error == ENOMEM) + error = 0; + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_nvlist_dst received property nvlist + * zc_nvlist_dst_size size of received property nvlist + * + * Gets received properties (distinct from local properties on or after + * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from + * local property values. + */ +static int +zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; nvlist_t *nv; - if (error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (error); - dmu_objset_fast_stat(os, &zc->zc_objset_stats); + /* + * Without this check, we would return local property values if the + * caller has not already received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + if (!dsl_prop_get_hasrecvd(os)) { + dmu_objset_rele(os, FTAG); + return (ENOTSUP); + } if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) { - dmu_objset_stats(os, nv); - /* - * NB: zvol_get_stats() will read the objset contents, - * which we aren't supposed to do with a - * DS_MODE_USER hold, because it could be - * inconsistent. So this is a bit of a workaround... 
- */ - if (!zc->zc_objset_stats.dds_inconsistent) { - if (dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); - } + (error = dsl_prop_get_received(os, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } - dmu_objset_close(os); - if (error == ENOMEM) - error = 0; + dmu_objset_rele(os, FTAG); return (error); } @@ -1456,8 +1869,8 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) objset_t *os; int err; - if (err = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + /* XXX reading without owning */ + if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); @@ -1482,11 +1895,11 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) } else { err = ENOENT; } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (err); } -static boolean_t +boolean_t dataset_name_hidden(const char *name) { /* @@ -1522,9 +1935,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) objset_t *os; int error; char *p; + size_t orig_len = strlen(zc->zc_name); - if (error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) { +top: + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = ESRCH; return (error); @@ -1544,7 +1958,7 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) int len = sizeof (zc->zc_name) - (p - zc->zc_name); while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) - (void) dmu_objset_prefetch(p, NULL); + (void) dmu_objset_prefetch(zc->zc_name, NULL); } do { @@ -1553,12 +1967,22 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && dataset_name_hidden(zc->zc_name)); - dmu_objset_close(os); + } while (error == 0 && dataset_name_hidden(zc->zc_name) && + !(zc->zc_iflags & FKIOCTL)); + dmu_objset_rele(os, FTAG); - if (error == 0) + /* + * If it's an internal dataset (ie. with a '$' in its name), + * don't try to get stats for it, otherwise we'll return ENOENT. + */ + if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + zc->zc_name[orig_len] = '\0'; + goto top; + } + } return (error); } @@ -1580,299 +2004,363 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *os; int error; - error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); +top: + if (zc->zc_cookie == 0) + (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, + NULL, DS_FIND_SNAPSHOTS); + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error) return (error == ENOENT ? ESRCH : error); - if (zc->zc_cookie == 0) { - (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); - } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. 
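The length test that follows relies on strlcat()'s return convention: it reports the total length of the string it tried to create, so a result greater than or equal to the buffer size signals truncation, meaning there is no room left for an "@" plus a snapshot name. A tiny sketch of that convention (strlcat() here is the BSD interface from <string.h>; the buffer size is deliberately small):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char name[16] = "tank/home/users";	/* already fills the buffer */

	if (strlcat(name, "@", sizeof (name)) >= sizeof (name))
		(void) printf("no room for a snapshot suffix on %s\n", name);
	return (0);
}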
*/ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (ESRCH); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); - dmu_objset_close(os); - if (error == 0) - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - else if (error == ENOENT) + zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, + NULL); + + if (error == 0) { + dsl_dataset_t *ds; + dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; + + /* + * Since we probably don't have a hold on this snapshot, + * it's possible that the objsetid could have been destroyed + * and reused for a new objset. It's OK if this happens during + * a zfs send operation, since the new createtxg will be + * beyond the range we're interested in. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) { + if (error == ENOENT) { + /* Racing with destroy, get the next one. */ + *strchr(zc->zc_name, '@') = '\0'; + dmu_objset_rele(os, FTAG); + goto top; + } + } else { + objset_t *ossnap; + + error = dmu_objset_from_ds(ds, &ossnap); + if (error == 0) + error = zfs_ioc_objset_stats_impl(zc, ossnap); + dsl_dataset_rele(ds, FTAG); + } + } else if (error == ENOENT) { error = ESRCH; + } + dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error) *strchr(zc->zc_name, '@') = '\0'; return (error); } -int -zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) +static int +zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { - nvpair_t *elem; - int error = 0; - uint64_t intval; - char *strval; - nvlist_t *genericnvl; - boolean_t issnap = (strchr(name, '@') != NULL); + const char *propname = nvpair_name(pair); + uint64_t *valary; + unsigned int vallen; + const char *domain; + char *dash; + zfs_userquota_prop_t type; + uint64_t rid; + uint64_t quota; + zfsvfs_t *zfsvfs; + int err; + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) != 0) + return (EINVAL); + } /* - * First validate permission to set all of the properties + * A correctly constructed propname is encoded as + * userquota@-. */ - VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); + if ((dash = strchr(propname, '-')) == NULL || + nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || + vallen != 3) + return (EINVAL); - if (prop == ZPROP_INVAL) { - /* - * If this is a user-defined property, it must be a - * string, and there is no further validation to do. 
- */ - if (zfs_prop_user(propname) && - nvpair_type(elem) == DATA_TYPE_STRING) { - if (error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED())) - return (error); - continue; - } + domain = dash + 1; + type = valary[0]; + rid = valary[1]; + quota = valary[2]; - if (!issnap && zfs_prop_userquota(propname) && - nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) { - const char *perm; - const char *up = zfs_userquota_prop_prefixes - [ZFS_PROP_USERQUOTA]; - if (strncmp(propname, up, strlen(up)) == 0) - perm = ZFS_DELEG_PERM_USERQUOTA; - else - perm = ZFS_DELEG_PERM_GROUPQUOTA; - if (error = zfs_secpolicy_write_perms(name, - perm, CRED())) - return (error); - continue; - } + err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); + if (err == 0) { + err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); + zfsvfs_rele(zfsvfs, FTAG); + } - return (EINVAL); - } + return (err); +} - if (issnap) - return (EINVAL); +/* + * If the named property is one that has a special function to set its value, + * return 0 on success and a positive error code on failure; otherwise if it is + * not one of the special properties handled by this function, return -1. + * + * XXX: It would be better for callers of the property interface if we handled + * these special cases in dsl_prop.c (in the dsl layer). + */ +static int +zfs_prop_set_special(const char *dsname, zprop_source_t source, + nvpair_t *pair) +{ + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; - if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) - return (error); + if (prop == ZPROP_INVAL) { + if (zfs_prop_userquota(propname)) + return (zfs_prop_set_userquota(dsname, pair)); + return (-1); + } - /* - * Check that this value is valid for this pool version - */ - switch (prop) { - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. We ignore any errors here since - * we'll catch them later. - */ - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && - zfs_earlier_version(name, - SPA_VERSION_GZIP_COMPRESSION)) - return (ENOTSUP); + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } - /* - * If this is a bootable dataset then - * verify that the compression algorithm - * is supported for booting. We must return - * something other than ENOTSUP since it - * implies a downrev pool version. 
- */ - if (zfs_is_bootfs(name) && - !BOOTFS_COMPRESS_VALID(intval)) - return (ERANGE); - } - break; + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) + return (-1); - case ZFS_PROP_COPIES: - if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS)) - return (ENOTSUP); - break; + VERIFY(0 == nvpair_value_uint64(pair, &intval)); - case ZFS_PROP_SHARESMB: - if (zpl_earlier_version(name, ZPL_VERSION_FUID)) - return (ENOTSUP); + switch (prop) { + case ZFS_PROP_QUOTA: + err = dsl_dir_set_quota(dsname, source, intval); + break; + case ZFS_PROP_REFQUOTA: + err = dsl_dataset_set_quota(dsname, source, intval); + break; + case ZFS_PROP_RESERVATION: + err = dsl_dir_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_REFRESERVATION: + err = dsl_dataset_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_VOLSIZE: + err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip), + intval); + break; + case ZFS_PROP_VERSION: + { + zfsvfs_t *zfsvfs; + + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; - case ZFS_PROP_ACLINHERIT: - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0) - if (intval == ZFS_ACL_PASSTHROUGH_X && - zfs_earlier_version(name, - SPA_VERSION_PASSTHROUGH_X)) - return (ENOTSUP); - } - } + err = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); + if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t *zc; - if (prop == ZPROP_INVAL) { - if (zfs_prop_userquota(propname)) { - uint64_t *valary; - unsigned int vallen; - const char *domain; - zfs_userquota_prop_t type; - uint64_t rid; - uint64_t quota; - zfsvfs_t *zfsvfs; - - VERIFY(nvpair_value_uint64_array(elem, - &valary, &vallen) == 0); - VERIFY(vallen == 3); - type = valary[0]; - rid = valary[1]; - quota = valary[2]; - domain = propname + - strlen(zfs_userquota_prop_prefixes[type]); - - error = zfsvfs_hold(name, B_FALSE, FTAG, - &zfsvfs); - if (error == 0) { - error = zfs_set_userquota(zfsvfs, - type, domain, rid, quota); - zfsvfs_rele(zfsvfs, FTAG); - } - if (error == 0) - continue; - else - goto out; - } else if (zfs_prop_user(propname)) { - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - goto out; - } + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dsname); + (void) zfs_ioc_userspace_upgrade(zc); + kmem_free(zc, sizeof (zfs_cmd_t)); } + break; + } - switch (prop) { - case ZFS_PROP_QUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, intval)) != 0) - goto out; - break; - - case ZFS_PROP_REFQUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_quota(name, intval)) != 0) - goto out; - break; - - case ZFS_PROP_RESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_reservation(name, - intval)) != 0) - goto out; - break; + default: + err = -1; + } - case ZFS_PROP_REFRESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_reservation(name, - intval)) != 0) - goto out; - break; + return (err); +} - case ZFS_PROP_VOLSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, - ddi_driver_major(zfs_dip), intval)) != 0) - goto out; - break; 
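The rewritten zfs_set_prop_nvlist() introduced just below no longer stops at the first failure: it keeps going, returns the first error it hit, and can hand back an errlist nvlist mapping each failed property name to an errno. A sketch of how a caller might walk such a list, assuming a userland build against libnvpair and an artificially constructed errlist (the real one arrives through zc_nvlist_dst):

#include <libnvpair.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Print every (property name -> errno) pair in an errlist. */
static void
print_errlist(nvlist_t *errlist)
{
	nvpair_t *pair = NULL;

	while ((pair = nvlist_next_nvpair(errlist, pair)) != NULL) {
		int32_t err;

		if (nvpair_value_int32(pair, &err) == 0)
			(void) printf("property '%s' not set: %s\n",
			    nvpair_name(pair), strerror(err));
	}
}

int
main(void)
{
	nvlist_t *errlist;

	(void) nvlist_alloc(&errlist, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_int32(errlist, "quota", EINVAL);
	print_errlist(errlist);
	nvlist_free(errlist);
	return (0);
}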
+/* + * This function is best effort. If it fails to set any of the given properties, + * it continues to set as many as it can and returns the first error + * encountered. If the caller provides a non-NULL errlist, it also gives the + * complete list of names of all the properties it failed to set along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property is set successfully, zero is returned and the list pointed + * at by errlist is NULL. + */ +int +zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, + nvlist_t **errlist) +{ + nvpair_t *pair; + nvpair_t *propval; + int rv = 0; + uint64_t intval; + char *strval; + nvlist_t *genericnvl; + nvlist_t *errors; + nvlist_t *retrynvl; - case ZFS_PROP_VOLBLOCKSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, intval)) != 0) - goto out; - break; + VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - case ZFS_PROP_VERSION: - { - zfsvfs_t *zfsvfs; - - if ((error = nvpair_value_uint64(elem, &intval)) != 0) - goto out; - if ((error = zfsvfs_hold(name, B_FALSE, FTAG, - &zfsvfs)) != 0) - goto out; - error = zfs_set_version(zfsvfs, intval); - zfsvfs_rele(zfsvfs, FTAG); - - if (error == 0 && intval >= ZPL_VERSION_USERSPACE) { - zfs_cmd_t zc = { 0 }; - (void) strcpy(zc.zc_name, name); - (void) zfs_ioc_userspace_upgrade(&zc); - } - if (error) - goto out; - break; +retry: + pair = NULL; + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + int err = 0; + + /* decode the property value */ + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) != 0) + err = EINVAL; } - default: - if (nvpair_type(elem) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != - PROP_TYPE_STRING) { - error = EINVAL; - goto out; - } - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + /* Validate value type */ + if (err == 0 && prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (nvpair_type(propval) != DATA_TYPE_STRING) + err = EINVAL; + } else if (zfs_prop_userquota(propname)) { + if (nvpair_type(propval) != + DATA_TYPE_UINT64_ARRAY) + err = EINVAL; + } + } else if (err == 0) { + if (nvpair_type(propval) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) + err = EINVAL; + } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; - VERIFY(nvpair_value_uint64(elem, &intval) == 0); + VERIFY(nvpair_value_uint64(propval, + &intval) == 0); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: - error = EINVAL; - goto out; + err = EINVAL; + break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) { - error = EINVAL; - goto out; - } + intval, &unused) != 0) + err = EINVAL; break; default: cmn_err(CE_PANIC, "unknown property type"); - break; } } else { - error = EINVAL; - goto out; + err = EINVAL; } - if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0) - goto out; } + + /* Validate permissions */ + if (err == 0) + err = zfs_check_settable(dsname, pair, CRED()); + + if (err == 0) { + err = zfs_prop_set_special(dsname, source, pair); + if 
(err == -1) { + /* + * For better performance we build up a list of + * properties to set in a single transaction. + */ + err = nvlist_add_nvpair(genericnvl, pair); + } else if (err != 0 && nvl != retrynvl) { + /* + * This may be a spurious error caused by + * receiving quota and reservation out of order. + * Try again in a second pass. + */ + err = nvlist_add_nvpair(retrynvl, pair); + } + } + + if (err != 0) + VERIFY(nvlist_add_int32(errors, propname, err) == 0); } - if (nvlist_next_nvpair(genericnvl, NULL) != NULL) { - error = dsl_props_set(name, genericnvl); + if (nvl != retrynvl && !nvlist_empty(retrynvl)) { + nvl = retrynvl; + goto retry; + } + + if (!nvlist_empty(genericnvl) && + dsl_props_set(dsname, source, genericnvl) != 0) { + /* + * If this fails, we still want to set as many properties as we + * can, so try setting them individually. + */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + int err = 0; + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) == 0); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(propval, + &strval) == 0); + err = dsl_prop_set(dsname, propname, source, 1, + strlen(strval) + 1, strval); + } else { + VERIFY(nvpair_value_uint64(propval, + &intval) == 0); + err = dsl_prop_set(dsname, propname, source, 8, + 1, &intval); + } + + if (err != 0) { + VERIFY(nvlist_add_int32(errors, propname, + err) == 0); + } + } } -out: nvlist_free(genericnvl); - return (error); + nvlist_free(retrynvl); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); } /* @@ -1881,15 +2369,15 @@ out: static int zfs_check_userprops(char *fsname, nvlist_t *nvl) { - nvpair_t *elem = NULL; + nvpair_t *pair = NULL; int error = 0; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); char *valstr; if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) + nvpair_type(pair) != DATA_TYPE_STRING) return (EINVAL); if (error = zfs_secpolicy_write_perms(fsname, @@ -1899,49 +2387,96 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) if (strlen(propname) >= ZAP_MAXNAMELEN) return (ENAMETOOLONG); - VERIFY(nvpair_value_string(elem, &valstr) == 0); + VERIFY(nvpair_value_string(pair, &valstr) == 0); if (strlen(valstr) >= ZAP_MAXVALUELEN) return (E2BIG); } return (0); } +static void +props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) +{ + nvpair_t *pair; + + VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + pair = NULL; + while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { + if (nvlist_exists(skipped, nvpair_name(pair))) + continue; + + VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); + } +} + +static int +clear_received_props(objset_t *os, const char *fs, nvlist_t *props, + nvlist_t *skipped) +{ + int err = 0; + nvlist_t *cleared_props = NULL; + props_skip(props, skipped, &cleared_props); + if (!nvlist_empty(cleared_props)) { + /* + * Acts on local properties until the dataset has received + * properties at least once on or after 
SPA_VERSION_RECVD_PROPS. + */ + zprop_source_t flags = (ZPROP_SRC_NONE | + (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); + } + nvlist_free(cleared_props); + return (err); +} + /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply - * zc_cookie clear existing local props? + * zc_cookie received properties flag * - * outputs: none + * outputs: + * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : + ZPROP_SRC_LOCAL); + nvlist_t *errors = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvl)) != 0) + zc->zc_iflags, &nvl)) != 0) return (error); - if (zc->zc_cookie) { + if (received) { nvlist_t *origprops; objset_t *os; - if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { - clear_props(zc->zc_name, origprops, nvl); + if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { + if (dsl_prop_get_received(os, &origprops) == 0) { + (void) clear_received_props(os, + zc->zc_name, origprops, nvl); nvlist_free(origprops); } - dmu_objset_close(os); - } + dsl_prop_set_hasrecvd(os); + dmu_objset_rele(os, FTAG); + } } - error = zfs_set_prop_nvlist(zc->zc_name, nvl); + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); + + if (zc->zc_nvlist_dst != 0 && errors != NULL) { + (void) put_nvlist(zc, errors); + } + nvlist_free(errors); nvlist_free(nvl); return (error); } @@ -1950,14 +2485,75 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) * inputs: * zc_name name of filesystem * zc_value name of property to inherit + * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { + const char *propname = zc->zc_value; + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received + ? ZPROP_SRC_NONE /* revert to received value, if any */ + : ZPROP_SRC_INHERITED); /* explicitly inherit */ + + if (received) { + nvlist_t *dummy; + nvpair_t *pair; + zprop_type_t type; + int err; + + /* + * zfs_prop_set_special() expects properties in the form of an + * nvpair with type info. + */ + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(propname)) + return (EINVAL); + + type = PROP_TYPE_STRING; + } else if (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION) { + return (EINVAL); + } else { + type = zfs_prop_get_type(prop); + } + + VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + switch (type) { + case PROP_TYPE_STRING: + VERIFY(0 == nvlist_add_string(dummy, propname, "")); + break; + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); + break; + default: + nvlist_free(dummy); + return (EINVAL); + } + + pair = nvlist_next_nvpair(dummy, NULL); + err = zfs_prop_set_special(zc->zc_name, source, pair); + nvlist_free(dummy); + if (err != -1) + return (err); /* special property already handled */ + } else { + /* + * Only check this in the non-received case. We want to allow + * 'inherit -S' to revert non-inheritable properties like quota + * and reservation to the received or default values even though + * they are not considered inheritable. 
+ */ + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + return (EINVAL); + } + /* the property name has been validated by zfs_secpolicy_inherit() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); + return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); } static int @@ -1966,28 +2562,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) nvlist_t *props; spa_t *spa; int error; - nvpair_t *elem; + nvpair_t *pair; - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props))) + if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props)) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ - elem = nvlist_next_nvpair(props, NULL); - if (elem != NULL && strcmp(nvpair_name(elem), + pair = nvlist_next_nvpair(props, NULL); + if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && - nvlist_next_nvpair(props, elem) == NULL) { + nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); - if (spa != NULL) + if (spa != NULL) { + nvlist_free(props); return (0); + } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { @@ -2034,57 +2632,6 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) return (error); } -static int -zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) -{ -#ifdef sun - nvlist_t *nvp; - int error; - uint32_t uid; - uint32_t gid; - uint32_t *groups; - uint_t group_cnt; - cred_t *usercred; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvp)) != 0) { - return (error); - } - - if ((error = nvlist_lookup_uint32(nvp, - ZFS_DELEG_PERM_UID, &uid)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - - if ((error = nvlist_lookup_uint32(nvp, - ZFS_DELEG_PERM_GID, &gid)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - - if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, - &groups, &group_cnt)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - usercred = cralloc(); - if ((crsetugid(usercred, uid, gid) != 0) || - (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) { - nvlist_free(nvp); - crfree(usercred); - return (EPERM); - } - nvlist_free(nvp); - error = dsl_deleg_access(zc->zc_name, - zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); - crfree(usercred); - return (error); -#else /* sun */ - return (EPERM); -#endif /* sun */ -} - /* * inputs: * zc_name name of filesystem @@ -2100,7 +2647,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &fsaclnv)) != 0) + zc->zc_iflags, &fsaclnv)) != 0) return (error); /* @@ -2157,30 +2704,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) } /* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_create_minor(zfs_cmd_t *zc) -{ - return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); -} - -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_remove_minor(zfs_cmd_t *zc) -{ - return (zvol_remove_minor(zc->zc_name)); -} - -/* * Search the vfs list for a specified resource. Returns a pointer to it * or NULL if no suitable entry is found. The caller of this routine * is responsible for releasing the returned vfs pointer. 
@@ -2234,8 +2757,8 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, - boolean_t *is_ci) + boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; @@ -2271,6 +2794,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || + (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) @@ -2312,11 +2836,13 @@ static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - boolean_t fuids_ok = B_TRUE; + boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[MAXNAMELEN]; char *cp; + spa_t *spa; + uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); @@ -2324,23 +2850,25 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, ASSERT(cp != NULL); cp[0] = '\0'; - if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) - zplver = ZPL_VERSION_USERSPACE - 1; - if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { - zplver = ZPL_VERSION_FUID - 1; - fuids_ok = B_FALSE; - } + if ((error = spa_open(dataset, &spa, FTAG)) != 0) + return (error); + + spa_vers = spa_version(spa); + spa_close(spa, FTAG); + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
*/ - if ((error = dmu_objset_open(parentname, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) + if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); - error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (error); } @@ -2348,17 +2876,17 @@ static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - boolean_t fuids_ok = B_TRUE; + boolean_t fuids_ok; + boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; - if (spa_vers < SPA_VERSION_FUID) { - zplver = ZPL_VERSION_FUID - 1; - fuids_ok = B_FALSE; - } + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); - error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops, - zplprops, is_ci); + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, + createprops, zplprops, is_ci); return (error); } @@ -2401,7 +2929,7 @@ zfs_ioc_create(zfs_cmd_t *zc) if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); zct.zct_zplprops = NULL; @@ -2417,21 +2945,18 @@ zfs_ioc_create(zfs_cmd_t *zc) return (EINVAL); } - error = dmu_objset_open(zc->zc_value, type, - DS_MODE_USER | DS_MODE_READONLY, &clone); + error = dmu_objset_hold(zc->zc_value, FTAG, &clone); if (error) { nvlist_free(nvprops); return (error); } - error = dmu_objset_create(zc->zc_name, type, clone, 0, - NULL, NULL); + error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); if (error) { - dmu_objset_close(clone); nvlist_free(nvprops); return (error); } - dmu_objset_close(clone); } else { boolean_t is_insensitive = B_FALSE; @@ -2488,7 +3013,7 @@ zfs_ioc_create(zfs_cmd_t *zc) return (error); } } - error = dmu_objset_create(zc->zc_name, type, NULL, + error = dmu_objset_create(zc->zc_name, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); } @@ -2497,10 +3022,16 @@ zfs_ioc_create(zfs_cmd_t *zc) * It would be nice to do this atomically. */ if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) - (void) dmu_objset_destroy(zc->zc_name); + error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, + nvprops, NULL); + if (error != 0) + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); } nvlist_free(nvprops); +#ifdef __FreeBSD__ + if (error == 0 && type == DMU_OST_ZVOL) + zvol_create_minors(zc->zc_name); +#endif return (error); } @@ -2511,7 +3042,8 @@ zfs_ioc_create(zfs_cmd_t *zc) * zc_cookie recursive flag * zc_nvlist_src[_size] property list * - * outputs: none + * outputs: + * zc_value short snapname (i.e. 
part after the '@') */ static int zfs_ioc_snapshot(zfs_cmd_t *zc) @@ -2525,21 +3057,21 @@ zfs_ioc_snapshot(zfs_cmd_t *zc) if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); error = zfs_check_userprops(zc->zc_name, nvprops); if (error) goto out; - if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL && + if (!nvlist_empty(nvprops) && zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { error = ENOTSUP; goto out; } - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, - nvprops, recursive); + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL, + nvprops, recursive, B_FALSE, -1); out: nvlist_free(nvprops); @@ -2547,20 +3079,15 @@ out: } int -zfs_unmount_snap(char *name, void *arg) +zfs_unmount_snap(const char *name, void *arg) { vfs_t *vfsp = NULL; if (arg) { char *snapname = arg; - int len = strlen(name) + strlen(snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) strcpy(buf, name); - (void) strcat(buf, "@"); - (void) strcat(buf, snapname); - vfsp = zfs_get_vfs(buf); - kmem_free(buf, len); + char *fullname = kmem_asprintf("%s@%s", name, snapname); + vfsp = zfs_get_vfs(fullname); + strfree(fullname); } else if (strchr(name, '@')) { vfsp = zfs_get_vfs(name); } @@ -2586,8 +3113,9 @@ zfs_unmount_snap(char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2602,126 +3130,408 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); if (err) return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, + zc->zc_defer_destroy)); +} + +/* + * inputs: + * zc_name name of dataset to destroy + * zc_objset_type type of objset + * zc_defer_destroy mark for deferred destroy + * + * outputs: none + */ +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + int err; + if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { + err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + + err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); + if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) + (void) zvol_remove_minor(zc->zc_name); + return (err); } -/* - * inputs: - * zc_name name of dataset to destroy - * zc_objset_type type of objset - * - * outputs: none - */ -static int -zfs_ioc_destroy(zfs_cmd_t *zc) -{ - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); +/* + * inputs: + * zc_name name of dataset to rollback (to most recent snapshot) + * + * outputs: none + */ +static int +zfs_ioc_rollback(zfs_cmd_t *zc) +{ + dsl_dataset_t *ds, *clone; + int error; + zfsvfs_t *zfsvfs; + char *clone_name; + + error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); + if (error) + return (error); + + /* must not be a snapshot */ + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* must have a most recent snapshot */ + if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* + * Create clone of most recent snapshot. 
+ */ + clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); + error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); + if (error) + goto out; + + error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); + if (error) + goto out; + + /* + * Do clone swap. + */ + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + error = zfs_suspend_fs(zfsvfs); + if (error == 0) { + int resume_err; + + if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { + error = dsl_dataset_clone_swap(clone, ds, + B_TRUE); + dsl_dataset_disown(ds, FTAG); + ds = NULL; + } else { + error = EBUSY; + } + resume_err = zfs_resume_fs(zfsvfs, zc->zc_name); + error = error ? error : resume_err; + } + VFS_RELE(zfsvfs->z_vfs); + } else { + if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { + error = dsl_dataset_clone_swap(clone, ds, B_TRUE); + dsl_dataset_disown(ds, FTAG); + ds = NULL; + } else { + error = EBUSY; + } + } + + /* + * Destroy clone (which also closes it). + */ + (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); + +out: + strfree(clone_name); + if (ds) + dsl_dataset_rele(ds, FTAG); + return (error); +} + +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie & 1; + + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '%')) + return (EINVAL); + + /* + * Unmount snapshot unless we're doing a recursive rename, + * in which case the dataset code figures out which snapshots + * to unmount. + */ + if (!recursive && strchr(zc->zc_name, '@') != NULL && + zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + if (zc->zc_objset_type == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); + return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); +} + +static int +zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) +{ + const char *propname = nvpair_name(pair); + boolean_t issnap = (strchr(dsname, '@') != NULL); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (err = zfs_secpolicy_write_perms(dsname, + ZFS_DELEG_PERM_USERPROP, cr)) + return (err); + return (0); + } + + if (!issnap && zfs_prop_userquota(propname)) { + const char *perm = NULL; + const char *uq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; + const char *gq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; + + if (strncmp(propname, uq_prefix, + strlen(uq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_USERQUOTA; + } else if (strncmp(propname, gq_prefix, + strlen(gq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_GROUPQUOTA; + } else { + /* USERUSED and GROUPUSED are read-only */ + return (EINVAL); + } + + if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) + return (err); + return (0); + } + + return (EINVAL); + } + + if (issnap) + return (EINVAL); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
+ */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. + */ + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(dsname, + SPA_VERSION_GZIP_COMPRESSION)) { + return (ENOTSUP); + } + + if (intval == ZIO_COMPRESS_ZLE && + zfs_earlier_version(dsname, + SPA_VERSION_ZLE_COMPRESSION)) + return (ENOTSUP); + + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. + */ + if (zfs_is_bootfs(dsname) && + !BOOTFS_COMPRESS_VALID(intval)) { + return (ERANGE); + } + } + break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_DEDUP: + if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + + case ZFS_PROP_ACLINHERIT: + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval == ZFS_ACL_PASSTHROUGH_X && + zfs_earlier_version(dsname, + SPA_VERSION_PASSTHROUGH_X)) + return (ENOTSUP); + } + break; } - return (dmu_objset_destroy(zc->zc_name)); + return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* - * inputs: - * zc_name name of dataset to rollback (to most recent snapshot) + * Removes properties from the given props list that fail permission checks + * needed to clear them and to restore them in case of a receive error. For each + * property, make sure we have both set and inherit permissions. * - * outputs: none + * Returns the first error encountered if any permission checks fail. If the + * caller provides a non-NULL errlist, it also gives the complete list of names + * of all the properties that failed a permission check along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property checks out successfully, zero is returned and the list + * pointed at by errlist is NULL. */ static int -zfs_ioc_rollback(zfs_cmd_t *zc) +zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { - objset_t *os; - int error; - zfsvfs_t *zfsvfs = NULL; - - /* - * Get the zfsvfs for the receiving objset. There - * won't be one if we're operating on a zvol, if the - * objset doesn't exist yet, or is not mounted. - */ - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os); - if (error) - return (error); + zfs_cmd_t *zc; + nvpair_t *pair, *next_pair; + nvlist_t *errors; + int err, rv = 0; - if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { - int mode; + if (props == NULL) + return (0); - error = zfs_suspend_fs(zfsvfs, NULL, &mode); - if (error == 0) { - int resume_err; + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = dmu_objset_rollback(os); - resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode); - error = error ? 
error : resume_err; - } else { - dmu_objset_close(os); + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dataset); + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + next_pair = nvlist_next_nvpair(props, pair); + + (void) strcpy(zc->zc_value, nvpair_name(pair)); + if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || + (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { + VERIFY(nvlist_remove_nvpair(props, pair) == 0); + VERIFY(nvlist_add_int32(errors, + zc->zc_value, err) == 0); } - VFS_RELE(zfsvfs->z_vfs); + pair = next_pair; + } + kmem_free(zc, sizeof (zfs_cmd_t)); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; } else { - error = dmu_objset_rollback(os); + VERIFY(nvpair_value_int32(pair, &rv) == 0); } - /* Note, the dmu_objset_rollback() releases the objset for us. */ - return (error); + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); } -/* - * inputs: - * zc_name old name of dataset - * zc_value new name of dataset - * zc_cookie recursive flag (only valid for snapshots) - * - * outputs: none - */ -static int -zfs_ioc_rename(zfs_cmd_t *zc) +static boolean_t +propval_equals(nvpair_t *p1, nvpair_t *p2) { - boolean_t recursive = zc->zc_cookie & 1; + if (nvpair_type(p1) == DATA_TYPE_NVLIST) { + /* dsl_prop_get_all_impl() format */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p1) == 0); + } - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '%')) - return (EINVAL); + if (nvpair_type(p2) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p2) == 0); + } - /* - * Unmount snapshot unless we're doing a recursive rename, - * in which case the dataset code figures out which snapshots - * to unmount. - */ - if (!recursive && strchr(zc->zc_name, '@') != NULL && - zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); + if (nvpair_type(p1) != nvpair_type(p2)) + return (B_FALSE); + + if (nvpair_type(p1) == DATA_TYPE_STRING) { + char *valstr1, *valstr2; + + VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); + VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + return (strcmp(valstr1, valstr2) == 0); + } else { + uint64_t intval1, intval2; + + VERIFY(nvpair_value_uint64(p1, &intval1) == 0); + VERIFY(nvpair_value_uint64(p2, &intval2) == 0); + return (intval1 == intval2); } - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } +/* + * Remove properties from props if they are not going to change (as determined + * by comparison with origprops). Remove them from origprops as well, since we + * do not need to clear or restore properties that won't change. 
+ */ static void -clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops) +props_reduce(nvlist_t *props, nvlist_t *origprops) { - zfs_cmd_t *zc; - nvpair_t *prop; + nvpair_t *pair, *next_pair; - if (props == NULL) - return; - zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); - (void) strcpy(zc->zc_name, dataset); - for (prop = nvlist_next_nvpair(props, NULL); prop; - prop = nvlist_next_nvpair(props, prop)) { - if (newprops != NULL && - nvlist_exists(newprops, nvpair_name(prop))) - continue; - (void) strcpy(zc->zc_value, nvpair_name(prop)); - if (zfs_secpolicy_inherit(zc, CRED()) == 0) - (void) zfs_ioc_inherit_prop(zc); + if (origprops == NULL) + return; /* all props need to be received */ + + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + const char *propname = nvpair_name(pair); + nvpair_t *match; + + next_pair = nvlist_next_nvpair(props, pair); + + if ((nvlist_lookup_nvpair(origprops, propname, + &match) != 0) || !propval_equals(pair, match)) + goto next; /* need to set received value */ + + /* don't clear the existing received value */ + (void) nvlist_remove_nvpair(origprops, match); + /* don't bother receiving the property */ + (void) nvlist_remove_nvpair(props, pair); +next: + pair = next_pair; } - kmem_free(zc, sizeof (zfs_cmd_t)); } +#ifdef DEBUG +static boolean_t zfs_ioc_recv_inject_err; +#endif + /* * inputs: * zc_name name of containing filesystem @@ -2731,9 +3541,14 @@ clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag + * zc_cleanup_fd cleanup-on-exit file descriptor + * zc_action_handle handle for this guid/ds mapping (or zero on first call) * * outputs: * zc_cookie number of bytes read + * zc_nvlist_dst{_size} error for each unapplied received property + * zc_obj zprop_errflags_t + * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) @@ -2741,15 +3556,18 @@ zfs_ioc_recv(zfs_cmd_t *zc) file_t *fp; objset_t *os; dmu_recv_cookie_t drc; - zfsvfs_t *zfsvfs = NULL; boolean_t force = (boolean_t)zc->zc_guid; - int error, fd; + int fd; + int error = 0; + int props_error = 0; + nvlist_t *errors; offset_t off; - nvlist_t *props = NULL; - nvlist_t *origprops = NULL; + nvlist_t *props = NULL; /* sent properties */ + nvlist_t *origprops = NULL; /* existing properties */ objset_t *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; + boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || @@ -2758,123 +3576,204 @@ zfs_ioc_recv(zfs_cmd_t *zc) (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); - *tosnap = '\0'; - tosnap++; + *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props)) != 0) + zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; - fp = getf(fd, 0); + fp = getf(fd); if (fp == NULL) { nvlist_free(props); return (EBADF); } - if (getzfsvfs(tofs, &zfsvfs) == 0) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - VFS_RELE(zfsvfs->z_vfs); - zfsvfs = NULL; - error = EBUSY; - goto out; + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { + if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && + !dsl_prop_get_hasrecvd(os)) { + first_recvd_props = B_TRUE; } + /* - * If new properties are supplied, they are to completely - * replace the 
existing ones, so stash away the existing ones. - */ - if (props) - (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE); - } else if (props && dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Get the props even if there was no zfsvfs (zvol or - * unmounted zpl). + * If new received properties are supplied, they are to + * completely replace the existing received properties, so stash + * away the existing ones. */ - (void) dsl_prop_get_all(os, &origprops, TRUE); + if (dsl_prop_get_received(os, &origprops) == 0) { + nvlist_t *errlist = NULL; + /* + * Don't bother writing a property if its value won't + * change (and avoid the unnecessary security checks). + * + * The first receive after SPA_VERSION_RECVD_PROPS is a + * special case where we blow away all local properties + * regardless. + */ + if (!first_recvd_props) + props_reduce(props, origprops); + if (zfs_check_clearable(tofs, origprops, + &errlist) != 0) + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } if (zc->zc_string[0]) { - error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &origin); + error = dmu_objset_hold(zc->zc_string, FTAG, &origin); if (error) goto out; } - error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, - force, origin, zfsvfs != NULL, &drc); + error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, + &zc->zc_begin_record, force, origin, &drc); if (origin) - dmu_objset_close(origin); + dmu_objset_rele(origin, FTAG); if (error) goto out; /* - * Reset properties. We do this before we receive the stream - * so that the properties are applied to the new data. + * Set properties before we receive the stream so that they are applied + * to the new data. Note that we must call dmu_recv_stream() if + * dmu_recv_begin() succeeds. */ if (props) { - clear_props(tofs, origprops, props); + nvlist_t *errlist; + + if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { + if (drc.drc_newfs) { + if (spa_version(os->os_spa) >= + SPA_VERSION_RECVD_PROPS) + first_recvd_props = B_TRUE; + } else if (origprops != NULL) { + if (clear_received_props(os, tofs, origprops, + first_recvd_props ? NULL : props) != 0) + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } else { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + dsl_prop_set_hasrecvd(os); + } else if (!drc.drc_newfs) { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, &errlist); + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + } + + if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { /* - * XXX - Note, this is all-or-nothing; should be best-effort. + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. */ - (void) zfs_set_prop_nvlist(tofs, props); + props_error = EINVAL; } off = fp->f_offset; - error = dmu_recv_stream(&drc, fp, &off); + error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, + &zc->zc_action_handle); - if (error == 0 && zfsvfs) { - char *osname; - int mode; + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; - /* online recv */ - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + int end_err; - error = dmu_recv_end(&drc); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); - error = error ? 
error : resume_err; + error = zfs_suspend_fs(zfsvfs); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc); + if (error == 0) + error = zfs_resume_fs(zfsvfs, tofs); + error = error ? error : end_err; + VFS_RELE(zfsvfs->z_vfs); } else { - dmu_recv_abort_cleanup(&drc); + error = dmu_recv_end(&drc); } - kmem_free(osname, MAXNAMELEN); - } else if (error == 0) { - error = dmu_recv_end(&drc); } zc->zc_cookie = off - fp->f_offset; if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; +#ifdef DEBUG + if (zfs_ioc_recv_inject_err) { + zfs_ioc_recv_inject_err = B_FALSE; + error = 1; + } +#endif /* * On error, restore the original props. */ if (error && props) { - clear_props(tofs, props, NULL); - (void) zfs_set_prop_nvlist(tofs, origprops); + if (dmu_objset_hold(tofs, FTAG, &os) == 0) { + if (clear_received_props(os, tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. + */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(os); + } + dmu_objset_rele(os, FTAG); + } else if (!drc.drc_newfs) { + /* We failed to clear the received properties. */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } + + if (origprops == NULL && !drc.drc_newfs) { + /* We failed to stash the original properties. */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } + + /* + * dsl_props_set() will not convert RECEIVED to LOCAL on or + * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL + * explictly if we're restoring local properties cleared in the + * first new-style receive. + */ + if (origprops != NULL && + zfs_set_prop_nvlist(tofs, (first_recvd_props ? + ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), + origprops, NULL) != 0) { + /* + * We stashed the original properties but failed to + * restore them. 
+ */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } } out: - if (zfsvfs) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } nvlist_free(props); nvlist_free(origprops); - releasef(fp); + nvlist_free(errors); + releasef(fd); + + if (error == 0) + error = props_error; + return (error); } /* * inputs: * zc_name name of snapshot to send - * zc_value short name of incremental fromsnap (may be empty) * zc_cookie file descriptor to send stream to - * zc_obj fromorigin flag (mutually exclusive with zc_value) + * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) + * zc_sendobj objsetid of snapshot to send + * zc_fromobj objsetid of incremental fromsnap (may be zero) * * outputs: none */ @@ -2886,36 +3785,55 @@ zfs_ioc_send(zfs_cmd_t *zc) file_t *fp; int error; offset_t off; + dsl_dataset_t *ds; + dsl_dataset_t *dsfrom = NULL; + spa_t *spa; + dsl_pool_t *dp; - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &tosnap); + error = spa_open(zc->zc_name, &spa, FTAG); if (error) return (error); - if (zc->zc_value[0] != '\0') { - char *buf; - char *cp; - - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strncpy(buf, zc->zc_name, MAXPATHLEN); - cp = strchr(buf, '@'); - if (cp) - *(cp+1) = 0; - (void) strlcat(buf, zc->zc_value, MAXPATHLEN); - error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &fromsnap); - kmem_free(buf, MAXPATHLEN); + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) { + spa_close(spa, FTAG); + return (error); + } + + error = dmu_objset_from_ds(ds, &tosnap); + if (error) { + dsl_dataset_rele(ds, FTAG); + spa_close(spa, FTAG); + return (error); + } + + if (zc->zc_fromobj != 0) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + if (error) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + error = dmu_objset_from_ds(dsfrom, &fromsnap); if (error) { - dmu_objset_close(tosnap); + dsl_dataset_rele(dsfrom, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } + } else { + spa_close(spa, FTAG); } - fp = getf(zc->zc_cookie, 1); + fp = getf(zc->zc_cookie); if (fp == NULL) { - dmu_objset_close(tosnap); - if (fromsnap) - dmu_objset_close(fromsnap); + dsl_dataset_rele(ds, FTAG); + if (dsfrom) + dsl_dataset_rele(dsfrom, FTAG); return (EBADF); } @@ -2924,10 +3842,10 @@ zfs_ioc_send(zfs_cmd_t *zc) if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; - releasef(fp); - if (fromsnap) - dmu_objset_close(fromsnap); - dmu_objset_close(tosnap); + releasef(zc->zc_cookie); + if (dsfrom) + dsl_dataset_rele(dsfrom, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } @@ -3003,16 +3921,41 @@ zfs_ioc_clear(zfs_cmd_t *zc) mutex_exit(&spa_namespace_lock); return (EIO); } - if (spa->spa_log_state == SPA_LOG_MISSING) { + if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ - spa->spa_log_state = SPA_LOG_CLEAR; + spa_set_log_state(spa, SPA_LOG_CLEAR); } + spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + if (zc->zc_cookie & ZPOOL_NO_REWIND) { + error = spa_open(zc->zc_name, &spa, FTAG); + } else { + nvlist_t *policy; + nvlist_t *config = NULL; + + if (zc->zc_nvlist_src == 0) + return (EINVAL); + + if ((error = get_nvlist(zc->zc_nvlist_src, + 
zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { + error = spa_open_rewind(zc->zc_name, &spa, FTAG, + policy, &config); + if (config != NULL) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + nvlist_free(config); + } + nvlist_free(policy); + } + } + + if (error) return (error); - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; @@ -3045,7 +3988,8 @@ zfs_ioc_clear(zfs_cmd_t *zc) * zc_name name of filesystem * zc_value name of origin snapshot * - * outputs: none + * outputs: + * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) @@ -3061,7 +4005,7 @@ zfs_ioc_promote(zfs_cmd_t *zc) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name)); + return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* @@ -3085,7 +4029,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); - error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs); + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error) return (error); @@ -3111,13 +4055,15 @@ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; - int error; + int bufsize = zc->zc_nvlist_dst_size; + + if (bufsize <= 0) + return (ENOMEM); - error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs); + int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error) return (error); - int bufsize = zc->zc_nvlist_dst_size; void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, @@ -3145,34 +4091,31 @@ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; - int error; + int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { - if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ - int mode; - error = zfs_suspend_fs(zfsvfs, NULL, &mode); - if (error == 0) { - error = zfs_resume_fs(zfsvfs, - zc->zc_name, mode); - } + error = zfs_suspend_fs(zfsvfs); + if (error == 0) + error = zfs_resume_fs(zfsvfs, zc->zc_name); } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); VFS_RELE(zfsvfs->z_vfs); } else { - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER, &os); + /* XXX kind of reading contents without owning */ + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error) return (error); error = dmu_objset_userspace_upgrade(os); - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } return (error); @@ -3219,7 +4162,7 @@ zfs_init_sharefs() } return (0); } -#endif /* sun */ +#endif /* sun */ static int zfs_ioc_share(zfs_cmd_t *zc) @@ -3302,27 +4245,135 @@ zfs_ioc_share(zfs_cmd_t *zc) break; } - opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || - zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? - SHAREFS_ADD : SHAREFS_REMOVE; + opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || + zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? 
+ SHAREFS_ADD : SHAREFS_REMOVE; + + /* + * Add or remove share from sharetab + */ + error = zshare_fs(opcode, + (void *)(uintptr_t)zc->zc_share.z_sharedata, + zc->zc_share.z_sharemax); + + return (error); + +#else /* !sun */ + return (ENOSYS); +#endif /* !sun */ +} + +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +/* + * inputs: + * zc_name name of containing filesystem + * zc_obj object # beyond which we want next in-use object # + * + * outputs: + * zc_obj next in-use object # + */ +static int +zfs_ioc_next_obj(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error) + return (error); + + error = dmu_object_next(os, &zc->zc_obj, B_FALSE, + os->os_dsl_dataset->ds_phys->ds_prev_snap_txg); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value prefix name for snapshot + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * + * outputs: + */ +static int +zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) +{ + char *snap_name; + int error; + + snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + + if (strlen(snap_name) >= MAXNAMELEN) { + strfree(snap_name); + return (E2BIG); + } + + error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name, + NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd); + if (error != 0) { + strfree(snap_name); + return (error); + } + + (void) strcpy(zc->zc_value, snap_name); + strfree(snap_name); + return (0); +} + +/* + * inputs: + * zc_name name of "to" snapshot + * zc_value name of "from" snapshot + * zc_cookie file descriptor to write diff data on + * + * outputs: + * dmu_diff_record_t's to the file descriptor + */ +static int +zfs_ioc_diff(zfs_cmd_t *zc) +{ + objset_t *fromsnap; + objset_t *tosnap; + file_t *fp; + offset_t off; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); + if (error) + return (error); + + error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); + if (error) { + dmu_objset_rele(tosnap, FTAG); + return (error); + } + + fp = getf(zc->zc_cookie); + if (fp == NULL) { + dmu_objset_rele(fromsnap, FTAG); + dmu_objset_rele(tosnap, FTAG); + return (EBADF); + } - /* - * Add or remove share from sharetab - */ - error = zshare_fs(opcode, - (void *)(uintptr_t)zc->zc_share.z_sharedata, - zc->zc_share.z_sharemax); + off = fp->f_offset; + error = dmu_diff(tosnap, fromsnap, fp, &off); + + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + releasef(zc->zc_cookie); + + dmu_objset_rele(fromsnap, FTAG); + dmu_objset_rele(tosnap, FTAG); return (error); -#else /* sun */ - return (ENOSYS); -#endif /* sun */ } -ace_t full_access[] = { - {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} -}; - #ifdef sun /* * Remove all ACL files in shares dir @@ -3368,7 +4419,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) /* Now make sure mntpnt and dataset are ZFS */ - if (vp->v_vfsp->vfs_fstype != zfsfstype || + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); @@ -3377,7 +4428,6 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; - ZFS_ENTER(zfsvfs); /* @@ -3440,7 +4490,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) case ZFS_SMB_ACL_RENAME: if ((error = get_nvlist(zc->zc_nvlist_src, - zc->zc_nvlist_src_size, &nvlist)) != 0) { + zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); @@ -3451,6 +4501,7 @@ 
zfs_ioc_smb_acl(zfs_cmd_t *zc) VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); + nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, @@ -3479,6 +4530,127 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) } /* + * inputs: + * zc_name name of filesystem + * zc_value short name of snap + * zc_string user-supplied tag for this hold + * zc_cookie recursive flag + * zc_temphold set if hold is temporary + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * zc_sendobj if non-zero, the objid for zc_name@zc_value + * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg + * + * outputs: none + */ +static int +zfs_ioc_hold(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + spa_t *spa; + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + minor_t minor = 0; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + if (zc->zc_sendobj == 0) { + return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, + zc->zc_string, recursive, zc->zc_temphold, + zc->zc_cleanup_fd)); + } + + if (recursive) + return (EINVAL); + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error) + return (error); + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + if (error) + return (error); + + /* + * Until we have a hold on this snapshot, it's possible that + * zc_sendobj could've been destroyed and reused as part + * of a later txg. Make sure we're looking at the right object. + */ + if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { + dsl_dataset_rele(ds, FTAG); + return (ENOENT); + } + + if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, + zc->zc_temphold); + if (minor != 0) { + if (error == 0) { + dsl_register_onexit_hold_cleanup(ds, zc->zc_string, + minor); + } + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + } + dsl_dataset_rele(ds, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of dataset from which we're releasing a user hold + * zc_value short name of snap + * zc_string user-supplied tag for this hold + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_release(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of snapshot holds + */ +static int +zfs_ioc_get_holds(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. 
@@ -3514,7 +4686,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_FALSE }, { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_TRUE }, { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, B_FALSE }, @@ -3534,6 +4706,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_TRUE }, { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, B_TRUE }, + { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, B_TRUE }, { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, @@ -3543,10 +4717,6 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, B_TRUE }, { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE, - B_FALSE }, { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, B_TRUE}, @@ -3566,14 +4736,14 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, - B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, + B_TRUE, B_TRUE }, { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, + B_TRUE }, { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_TRUE }, { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, @@ -3582,13 +4752,9 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_TRUE }, { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE, - B_FALSE }, { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }, - { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }, { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, B_FALSE }, { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, @@ -3597,15 +4763,30 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { DATASET_NAME, B_FALSE, B_FALSE }, { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, DATASET_NAME, B_FALSE, B_TRUE }, - { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_TRUE } + { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, 
B_FALSE, + B_FALSE }, + { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME, + B_FALSE, B_FALSE }, + { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE } }; int pool_status_check(const char *name, zfs_ioc_namecheck_t type) { spa_t *spa; - char pool[ZFS_MAXNAMELEN]; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME); @@ -3619,27 +4800,157 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type) return (error); } +/* + * Find a free minor number. + */ +minor_t +zfsdev_minor_alloc(void) +{ + static minor_t last_minor; + minor_t m; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + for (m = last_minor + 1; m != last_minor; m++) { + if (m > ZFSDEV_MAX_MINOR) + m = 1; + if (ddi_get_soft_state(zfsdev_state, m) == NULL) { + last_minor = m; + return (m); + } + } + + return (0); +} + +static int +zfs_ctldev_init(struct cdev *devp) +{ + minor_t minor; + zfs_soft_state_t *zs; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (ENXIO); + + if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) + return (EAGAIN); + + devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); + + zs = ddi_get_soft_state(zfsdev_state, minor); + zs->zss_type = ZSST_CTLDEV; + zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); + + return (0); +} + +static void +zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) +{ + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + zfs_onexit_destroy(zo); + ddi_soft_state_free(zfsdev_state, minor); +} + +void * +zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) +{ + zfs_soft_state_t *zp; + + zp = ddi_get_soft_state(zfsdev_state, minor); + if (zp == NULL || zp->zss_type != which) + return (NULL); + + return (zp->zss_data); +} + +static int +zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) +{ + int error = 0; + +#ifdef sun + if (getminor(*devp) != 0) + return (zvol_open(devp, flag, otyp, cr)); +#endif + + /* This is the control device. Allocate a new minor if requested. */ + if (flag & FEXCL) { + mutex_enter(&zfsdev_state_lock); + error = zfs_ctldev_init(devp); + mutex_exit(&zfsdev_state_lock); + } + + return (error); +} + +static void +zfsdev_close(void *data) +{ + zfs_onexit_t *zo; + minor_t minor = (minor_t)(uintptr_t)data; + + if (minor == 0) + return; + + mutex_enter(&zfsdev_state_lock); + zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (zo == NULL) { + mutex_exit(&zfsdev_state_lock); + return; + } + zfs_ctldev_destroy(zo, minor); + mutex_exit(&zfsdev_state_lock); +} + static int zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { - zfs_cmd_t *zc = (void *)addr; + zfs_cmd_t *zc; uint_t vec; - int error; + int cflag, error, len; + + cflag = ZFS_CMD_COMPAT_NONE; + len = IOCPARM_LEN(cmd); /* * Check if we have sufficient kernel memory allocated * for the zfs_cmd_t request. Bail out if not so we * will not access undefined memory region. 
*/ - if (IOCPARM_LEN(cmd) < sizeof(zfs_cmd_t)) - return (EINVAL); + if (len < sizeof(zfs_cmd_t)) + if (len == sizeof(zfs_cmd_v15_t)) { + cflag = ZFS_CMD_COMPAT_V15; + vec = zfs_ioctl_v15_to_v28[ZFS_IOC(cmd)]; + } else + return (EINVAL); + else + vec = ZFS_IOC(cmd); - vec = ZFS_IOC(cmd); + if (cflag != ZFS_CMD_COMPAT_NONE) { + if (vec == ZFS_IOC_COMPAT_PASS) + return (0); + else if (vec == ZFS_IOC_COMPAT_FAIL) + return (ENOTSUP); + } if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (EINVAL); + if (cflag != ZFS_CMD_COMPAT_NONE) { + zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); + bzero(zc, sizeof(zfs_cmd_t)); + zfs_cmd_compat_get(zc, addr, cflag); + zfs_ioctl_compat_pre(zc, &vec, cflag); + } else { + zc = (void *)addr; + } + error = zfs_ioc_vec[vec].zvec_secpolicy(zc, td->td_ucred); /* @@ -3648,6 +4959,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, */ if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + zc->zc_iflags = flag & FKIOCTL; switch (zfs_ioc_vec[vec].zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) @@ -3678,9 +4990,68 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, zfs_log_history(zc); } + if (cflag != ZFS_CMD_COMPAT_NONE) { + zfs_ioctl_compat_post(zc, ZFS_IOC(cmd), cflag); + zfs_cmd_compat_put(zc, addr, cflag); + kmem_free(zc, sizeof(zfs_cmd_t)); + } + return (error); } +#ifdef sun +static int +zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + zfs_dip = dip; + + ddi_report_dev(dip); + + return (DDI_SUCCESS); +} + +static int +zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (spa_busy() || zfs_busy() || zvol_busy()) + return (DDI_FAILURE); + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + zfs_dip = NULL; + + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = zfs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + return (DDI_SUCCESS); + } + + return (DDI_FAILURE); +} +#endif /* sun */ + /* * OK, so this is a little weird. * @@ -3690,8 +5061,60 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, * /dev/zfs has basically nothing to do except serve up ioctls, * so most of the standard driver entry points are in zvol.c. 
*/ +#ifdef sun +static struct cb_ops zfs_cb_ops = { + zfsdev_open, /* open */ + zfsdev_close, /* close */ + zvol_strategy, /* strategy */ + nodev, /* print */ + zvol_dump, /* dump */ + zvol_read, /* read */ + zvol_write, /* write */ + zfsdev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ + CB_REV, /* version */ + nodev, /* async read */ + nodev, /* async write */ +}; + +static struct dev_ops zfs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + zfs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + zfs_attach, /* attach */ + zfs_detach, /* detach */ + nodev, /* reset */ + &zfs_cb_ops, /* driver operations */ + NULL, /* no bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv zfs_modldrv = { + &mod_driverops, + "ZFS storage pool", + &zfs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&zfs_modlfs, + (void *)&zfs_modldrv, + NULL +}; +#endif /* sun */ + static struct cdevsw zfs_cdevsw = { .d_version = D_VERSION, + .d_open = zfsdev_open, .d_ioctl = zfsdev_ioctl, .d_name = ZFS_DEV_NAME }; @@ -3716,6 +5139,69 @@ struct proc *zfsproc; uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; +#ifdef sun +int +_init(void) +{ + int error; + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + + if ((error = mod_install(&modlinkage)) != 0) { + zvol_fini(); + zfs_fini(); + spa_fini(); + return (error); + } + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, NULL); + + error = ldi_ident_from_mod(&modlinkage, &zfs_li); + ASSERT(error == 0); + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +int +_fini(void) +{ + int error; + + if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + zvol_fini(); + zfs_fini(); + spa_fini(); + if (zfs_nfsshare_inited) + (void) ddi_modclose(nfs_mod); + if (zfs_smbshare_inited) + (void) ddi_modclose(smbsrv_mod); + if (zfs_nfsshare_inited || zfs_smbshare_inited) + (void) ddi_modclose(sharefs_mod); + + tsd_destroy(&zfs_fsyncer_key); + ldi_ident_release(zfs_li); + zfs_li = NULL; + mutex_destroy(&zfs_share_lock); + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} +#endif /* sun */ + static int zfs_modevent(module_t mod, int type, void *unused __unused) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c index 3105088..29378d8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -44,14 +43,6 @@ #include #include -#define ZFS_HANDLE_REPLAY(zilog, tx) \ - if (zilog->zl_replay) { \ - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \ - zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \ - zilog->zl_replaying_seq; \ - return; \ - } - /* * These zfs_log_* functions must be called within a dmu tx, in one * of 2 contexts depending on zilog->z_replay: @@ -180,6 +171,15 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + *attrs |= (xoap->xoa_reparse == 0) ? 0 : + XAT0_REPARSE; + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + *attrs |= (xoap->xoa_offline == 0) ? 0 : + XAT0_OFFLINE; + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + *attrs |= (xoap->xoa_sparse == 0) ? 0 : + XAT0_SPARSE; } static void * @@ -241,7 +241,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; - uint64_t seq; lr_create_t *lr; lr_acl_create_t *lracl; size_t aclsize; @@ -253,11 +252,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t namesize = strlen(name) + 1; size_t fuidsz = 0; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - /* * If we have FUIDs present then add in space for * domains and ACE fuid's if any. @@ -288,21 +285,25 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { - lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + lr->lr_mode = zp->z_mode; + if (!IS_EPHEMERAL(zp->z_uid)) { + lr->lr_uid = (uint64_t)zp->z_uid; } else { lr->lr_uid = fuidp->z_fuid_owner; } - if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { - lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + if (!IS_EPHEMERAL(zp->z_gid)) { + lr->lr_gid = (uint64_t)zp->z_gid; } else { lr->lr_gid = fuidp->z_fuid_group; } - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - lr->lr_rdev = zp->z_phys->zp_rdev; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, + sizeof (lr->lr_rdev)) != 0) + lr->lr_rdev = 0; /* * Fill in xvattr info if any @@ -341,9 +342,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ bcopy(name, end, namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -351,25 +350,23 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name) + znode_t *dzp, char *name, uint64_t foid) { itx_t *itx; - uint64_t seq; lr_remove_t *lr; size_t namesize = strlen(name) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_remove_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; bcopy(name, (char *)(lr + 1), namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; + itx->itx_oid = foid; + + 
zil_itx_assign(zilog, itx, tx); } /* @@ -380,24 +377,19 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; - uint64_t seq; lr_link_t *lr; size_t namesize = strlen(name) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_link_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_link_obj = zp->z_id; bcopy(name, (char *)(lr + 1), namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -408,32 +400,28 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link) { itx_t *itx; - uint64_t seq; lr_create_t *lr; size_t namesize = strlen(name) + 1; size_t linksize = strlen(link) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_uid = zp->z_uid; + lr->lr_gid = zp->z_gid; + lr->lr_mode = zp->z_mode; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); bcopy(name, (char *)(lr + 1), namesize); bcopy(link, (char *)(lr + 1) + namesize, linksize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -444,27 +432,22 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; - uint64_t seq; lr_rename_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); lr = (lr_rename_t *)&itx->itx_lr; lr->lr_sdoid = sdzp->z_id; lr->lr_tdoid = tdzp->z_id; bcopy(sname, (char *)(lr + 1), snamesize); bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + itx->itx_oid = szp->z_id; - seq = zil_itx_assign(zilog, itx, tx); - sdzp->z_last_itx = seq; - tdzp->z_last_itx = seq; - szp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -472,9 +455,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ ssize_t zfs_immediate_write_sz = 32768; -#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ - sizeof (lr_write_t)) - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) @@ -482,37 +462,17 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; + ssize_t immediate_write_sz; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + ? 
0 : zfs_immediate_write_sz; - /* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * In this mode, if we need to commit the write later, then the block - * is immediately written into the file system (using dmu_sync), - * and a pointer to the block is put into the log record. - * When the txg commits the block is linked in. - * This saves additionally writing the data into the log record. - * There are a few requirements for this to occur: - * - write is greater than zfs_immediate_write_sz - * - not using slogs (as slogs are assumed to always be faster - * than writing into the main pool) - * - the write occupies only one block - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ - slogging = spa_has_slogs(zilog->zl_spa); - if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) + slogging = spa_has_slogs(zilog->zl_spa) && + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; @@ -541,8 +501,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - kmem_free(itx, offsetof(itx_t, itx_lr) + - itx->itx_lr.lrc_reclen); + zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; @@ -559,13 +518,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx->itx_private = zp->z_zfsvfs; - if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || - (ioflag & (FSYNC | FDSYNC))) - itx->itx_sync = B_TRUE; - else + if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && + (fsync_cnt == 0)) itx->itx_sync = B_FALSE; - zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + zil_itx_assign(zilog, itx, tx); off += len; resid -= len; @@ -580,14 +537,11 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len) { itx_t *itx; - uint64_t seq; lr_truncate_t *lr; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; @@ -595,8 +549,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, lr->lr_length = len; itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -607,18 +560,14 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { itx_t *itx; - uint64_t seq; lr_setattr_t *lr; xvattr_t *xvap = (xvattr_t *)vap; size_t recsize = sizeof (lr_setattr_t); void *start; - - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - /* * If XVATTR set, then log record size needs to allow * for lr_attr_t + xvattr mask, mapsize and create time @@ -662,8 +611,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int 
txtype, (void) zfs_log_fuid_domains(fuidp, start); itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -674,7 +622,6 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) { itx_t *itx; - uint64_t seq; lr_acl_v0_t *lrv0; lr_acl_t *lr; int txtype; @@ -682,11 +629,9 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, size_t txsize; size_t aclbytes = vsecp->vsa_aclentsz; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? TX_ACL_V0 : TX_ACL; @@ -732,6 +677,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, } itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c new file mode 100644 index 0000000..ca0acfd --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS kernel routines may add/delete callback routines to be invoked + * upon process exit (triggered via the close operation from the /dev/zfs + * driver). + * + * These cleanup callbacks are intended to allow for the accumulation + * of kernel state across multiple ioctls. User processes participate + * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a + * clone-open, generating a unique minor number. The process then passes + * along that file descriptor to each ioctl that might have a cleanup operation. + * + * Consumers of the onexit routines should call zfs_onexit_fd_hold() early + * on to validate the given fd and add a reference to its file table entry. + * This allows the consumer to do its work and then add a callback, knowing + * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers + * should call zfs_onexit_fd_rele(). + * + * A simple example is zfs_ioc_recv(), where we might create an AVL tree + * with dataset/GUID mappings and then reuse that tree on subsequent + * zfs_ioc_recv() calls. + * + * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() + * the AVL tree and pass it along with a callback function to + * zfs_onexit_add_cb(). 
The zfs_onexit_add_cb() routine will register the + * callback and return an action handle. + * + * The action handle is then passed from user space to subsequent + * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree + * by calling zfs_onexit_cb_data() with the device minor number and + * action handle. + * + * If the user process exits abnormally, the callback is invoked implicitly + * as part of the driver close operation. Once the user space process is + * finished with the accumulated kernel state, it can also just call close(2) + * on the cleanup fd to trigger the cleanup callback. + */ + +void +zfs_onexit_init(zfs_onexit_t **zop) +{ + zfs_onexit_t *zo; + + zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); + mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), + offsetof(zfs_onexit_action_node_t, za_link)); +} + +void +zfs_onexit_destroy(zfs_onexit_t *zo) +{ + zfs_onexit_action_node_t *ap; + + mutex_enter(&zo->zo_lock); + while ((ap = list_head(&zo->zo_actions)) != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + mutex_enter(&zo->zo_lock); + } + mutex_exit(&zo->zo_lock); + + list_destroy(&zo->zo_actions); + mutex_destroy(&zo->zo_lock); + kmem_free(zo, sizeof (zfs_onexit_t)); +} + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (*zo == NULL) + return (EBADF); + + return (0); +} + +/* + * Consumers might need to operate by minor number instead of fd, since + * they might be running in another thread (e.g. txg_sync_thread). Callers + * of this function must call zfs_onexit_fd_rele() when they're finished + * using the minor number. + */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp, *tmpfp; + zfs_onexit_t *zo; + void *data; + int error; + + fp = getf(fd); + if (fp == NULL) + return (EBADF); + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + if (error == 0) + *minorp = (minor_t)(uintptr_t)data; + curthread->td_fpop = tmpfp; + if (error != 0) + return (error); + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + releasef(fd); +} + +/* + * Add a callback to be invoked when the calling process exits. + */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); + list_link_init(&ap->za_link); + ap->za_func = func; + ap->za_data = data; + + mutex_enter(&zo->zo_lock); + list_insert_tail(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (action_handle) + *action_handle = (uint64_t)(uintptr_t)ap; + + return (0); +} + +static zfs_onexit_action_node_t * +zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) +{ + zfs_onexit_action_node_t *match; + zfs_onexit_action_node_t *ap; + list_t *l; + + ASSERT(MUTEX_HELD(&zo->zo_lock)); + + match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; + l = &zo->zo_actions; + for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { + if (match == ap) + break; + } + return (ap); +} + +/* + * Delete the callback, triggering it first if 'fire' is set. 
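A minimal sketch of the consumer flow described in the block comment at the top of this file, using the interfaces defined here. The my_state_t type, the my_*() wrapper names and the cleanup_fd argument are hypothetical placeholders added for illustration; only the zfs_onexit_*() calls come from this change.

static void
my_state_free(void *arg)
{
	/* my_state_t is a hypothetical per-consumer state structure. */
	kmem_free(arg, sizeof (my_state_t));
}

static int
my_first_ioctl(int cleanup_fd, uint64_t *action_handle)
{
	minor_t minor;
	my_state_t *state;
	int error;

	/* Validate the cleanup fd and hold its file table entry. */
	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (error != 0)
		return (error);

	state = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);

	/* Register the cleanup; the action handle goes back to user space. */
	error = zfs_onexit_add_cb(minor, my_state_free, state, action_handle);
	if (error != 0)
		kmem_free(state, sizeof (my_state_t));

	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}

static int
my_later_ioctl(int cleanup_fd, uint64_t action_handle)
{
	minor_t minor;
	my_state_t *state;
	int error;

	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (error != 0)
		return (error);

	/* Fetch the state registered earlier, by its action handle. */
	error = zfs_onexit_cb_data(minor, action_handle, (void **)&state);
	if (error == 0) {
		/* ... use the accumulated state ... */
	}

	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}

If the consumer never removes the callback with zfs_onexit_del_cb(), closing the cleanup fd (or exiting) runs it implicitly: the driver close path reaches zfs_onexit_destroy(), which fires any actions still on the list.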
+ */ +int +zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (fire) + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + } else { + mutex_exit(&zo->zo_lock); + error = ENOENT; + } + + return (error); +} + +/* + * Return the data associated with this callback. This allows consumers + * of the cleanup-on-exit interfaces to stash kernel data across system + * calls, knowing that it will be cleaned up if the calling process exits. + */ +int +zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + *data = NULL; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) + *data = ap->za_data; + else + error = ENOENT; + mutex_exit(&zo->zo_lock); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c index c965247..ebea17a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -132,6 +129,12 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); } static int @@ -516,7 +519,6 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; case TX_MKXATTR: - name = (char *)(lr + 1); error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); break; case TX_SYMLINK: @@ -531,10 +533,8 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) VOP_UNLOCK(ZTOV(dzp), 0); out: - if (error == 0 && vp != NULL) { - VOP_UNLOCK(vp, 0); - VN_RELE(vp); - } + if (error == 0 && vp != NULL) + VN_URELE(vp); VN_RELE(ZTOV(dzp)); @@ -588,6 +588,7 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) } vput(vp); VOP_UNLOCK(ZTOV(dzp), 0); + fail: VN_RELE(ZTOV(dzp)); @@ -616,6 +617,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; + cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; @@ -710,7 +712,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; - uint64_t orig_eof, eod; + uint64_t eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -725,15 +727,10 @@ 
zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) error = 0; return (error); } - orig_eof = zp->z_phys->zp_size; - eod = lr->lr_offset + lr->lr_length; /* end of data for this write */ - - /* If it's a dmu_sync() block get the data and write the whole block */ - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) - zil_get_replay_data(zfsvfs->z_log, lr); - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, - lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + offset = lr->lr_offset; + length = lr->lr_length; + eod = offset + length; /* end of data for this write */ /* * This may be a write from a dmu_sync() for a whole block, @@ -741,12 +738,29 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) * We can't just replay what was written for this TX_WRITE as * a future TX_WRITE2 may extend the eof and the data for that * write needs to be there. So we write the whole block and - * reduce the eof. + * reduce the eof. This needs to be done within the single dmu + * transaction created within vn_rdwr -> zfs_write. So a possible + * new end of file is passed through in zfsvfs->z_replay_eof */ - if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ - zp->z_phys->zp_size = eod; + + zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + if (zp->z_size < eod) + zfsvfs->z_replay_eof = eod; + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); VN_RELE(ZTOV(zp)); + zfsvfs->z_replay_eof = 0; /* safety */ return (error); } @@ -767,21 +781,34 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log writes out of order, it's possible the - * file has been removed. In this case just drop the write - * and return success. 
- */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } +top: end = lr->lr_offset + lr->lr_length; - if (end > zp->z_phys->zp_size) { - ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); - zp->z_phys->zp_size = end; + if (end > zp->z_size) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + zp->z_size = end; + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + VN_RELE(ZTOV(zp)); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + return (error); + } + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + + /* Ensure the replayed seq is updated */ + (void) zil_replaying(zfsvfs->z_log, tx); + + dmu_tx_commit(tx); } VN_RELE(ZTOV(zp)); @@ -792,9 +819,33 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) static int zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) { +#ifdef sun + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&fl, sizeof (fl)); + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = lr->lr_offset; + fl.l_len = lr->lr_length; + error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, + lr->lr_offset, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +#else /* !sun */ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); return (EOPNOTSUPP); +#endif /* !sun */ } static int @@ -816,16 +867,8 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log setattrs out of order, it's possible the - * file has been removed. In this case just drop the setattr - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); @@ -874,16 +917,8 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) zfs_oldace_byteswap(ace, lr->lr_aclcnt); } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; @@ -935,16 +970,8 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) } } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } #ifdef TODO bzero(&vsa, sizeof (vsa)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index 4de8d8a..7fd8f60 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -112,7 +112,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) * Range locking is also used by zvol and uses a * dummied up znode. However, for zvol, we don't need to * append or grow blocksize, and besides we don't have - * a z_phys or z_zfsvfs - so skip that processing. + * a "sa" data or z_zfsvfs - so skip that processing. * * Yes, this is ugly, and would be solved by not handling * grow or append in range lock code. If that was done then @@ -125,14 +125,14 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) * This is done under z_range_lock to avoid races. */ if (new->r_type == RL_APPEND) - new->r_off = zp->z_phys->zp_size; + new->r_off = zp->z_size; /* * If we need to grow the block size then grab the whole * file range. This is also done under z_range_lock to * avoid races. */ - end_size = MAX(zp->z_phys->zp_size, new->r_off + len); + end_size = MAX(zp->z_size, new->r_off + len); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { new->r_off = 0; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c new file mode 100644 index 0000000..d141e43 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +/* + * ZPL attribute registration table. + * Order of attributes doesn't matter + * a unique value will be assigned for each + * attribute that is file system specific + * + * This is just the set of ZPL attributes that this + * version of ZFS deals with natively. The file system + * could have other attributes stored in files, but they will be + * ignored. The SA framework will preserve them, just that + * this version of ZFS won't change or delete them. 
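As context for the registration table that follows: a minimal sketch, assuming an already-opened objset, the SA object number read from the master node, and a znode with a valid SA handle, of how the table is consumed. The wrapper name zfs_sa_sketch() is hypothetical; sa_setup() and sa_lookup() are used with the same signatures they have elsewhere in this patch.

static int
zfs_sa_sketch(objset_t *os, uint64_t sa_obj, zfsvfs_t *zfsvfs, znode_t *zp)
{
	uint64_t gen;
	int error;

	/* Register the ZPL attribute set once per mounted file system. */
	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);
	if (error != 0)
		return (error);

	/* Individual attributes are then read through the znode's SA handle. */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen,
	    sizeof (uint64_t));
	return (error);
}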
+ */ + +sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, + {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, + {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, + {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, + {"ZPL_DACL_ACES", 0, SA_ACL, 0}, + {NULL, 0, 0, 0} +}; + +#ifdef _KERNEL + +int +zfs_sa_readlink(znode_t *zp, uio_t *uio) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + size_t bufsz; + int error; + + bufsz = zp->z_size; + if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { + error = uiomove((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, + 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + } + return (error); +} + +void +zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + + if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { + VERIFY(dmu_set_bonus(db, + len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); + if (len) { + bcopy(link, (caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, len); + } + } else { + dmu_buf_t *dbp; + + zfs_grow_blocksize(zp, len, tx); + VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); + + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } +} + +void +zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) { + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)) != 0) + return; + } else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) + return; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + + if (len <= doi.doi_bonus_size) { + (void) memcpy(xoap->xoa_av_scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + sizeof (xoap->xoa_av_scanstamp)); + } + } + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); +} + +void +zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + 
&xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), tx)); + else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(db, len, tx) == 0); + (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + + zp->z_pflags |= ZFS_BONUS_SCANSTAMP; + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &zp->z_pflags, sizeof (uint64_t), tx)); + } +} + +/* + * I'm not convinced we should do any of this upgrade. + * since the SA code can read both old/new znode formats + * with probably little to know performance difference. + * + * All new files will be created with the new format. + */ + +void +zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(hdl); + znode_t *zp = sa_get_userdata(hdl); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + sa_bulk_attr_t bulk[20]; + int count = 0; + sa_bulk_attr_t sa_attrs[20] = { 0 }; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t uid, gid, mode, rdev, xattr, parent; + uint64_t crtime[2], mtime[2], ctime[2]; + zfs_acl_phys_t znode_acl; + char scanstamp[AV_SCANSTAMP_SZ]; + boolean_t drop_lock = B_FALSE; + + /* + * No upgrade if ACL isn't cached + * since we won't know which locks are held + * and ready the ACL would require special "locked" + * interfaces that would be messy + */ + if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) + return; + + /* + * If the z_lock is held and we aren't the owner + * the just return since we don't want to deadlock + * trying to update the status of z_is_sa. This + * file can then be upgraded at a later time. + * + * Otherwise, we know we are doing the + * sa_update() that caused us to enter this function. 
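A rough caller-side sketch of how that sa_update() comes about, assuming the sa_register_update_callback(os, zfs_sa_upgrade) registration made in zfs_vfsops.c later in this patch: a ZPL operation reserves the extra holds with zfs_sa_upgrade_txholds() (defined at the end of this file) before assigning its transaction, so the upgrade can run inside the same tx. The wrapper name and the choice of attribute written are illustrative only, not part of the change.

static int
zfs_set_size_sketch(znode_t *zp, uint64_t newsize)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	int error;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);

	/* Reserve room for a possible old-znode to SA layout upgrade. */
	zfs_sa_upgrade_txholds(tx, zp);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	/* This sa_update() is what may fire zfs_sa_upgrade() via the callback. */
	error = sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    (void *)&newsize, sizeof (uint64_t), tx);

	dmu_tx_commit(tx);
	return (error);
}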
+ */ + if (mutex_owner(&zp->z_lock) != curthread) { + if (mutex_tryenter(&zp->z_lock) == 0) + return; + else + drop_lock = B_TRUE; + } + + /* First do a bulk query of the attributes that aren't cached */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &znode_acl, 88); + + if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) + goto done; + + + /* + * While the order here doesn't matter its best to try and organize + * it is such a way to pick up an already existing layout number + */ + count = 0; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), + NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, + zp->z_atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &zp->z_acl_cached->z_acl_count, 8); + + if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) + zfs_acl_xform(zp, zp->z_acl_cached, CRED()); + + locate.cb_aclp = zp->z_acl_cached; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); + + if (xattr) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), + NULL, &xattr, 8); + + /* if scanstamp then add scanstamp */ + + if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + scanstamp, AV_SCANSTAMP_SZ); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), + NULL, scanstamp, AV_SCANSTAMP_SZ); + zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; + } + + VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); + VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, + count, tx) == 0); + if (znode_acl.z_acl_extern_obj) + VERIFY(0 == dmu_object_free(zfsvfs->z_os, + znode_acl.z_acl_extern_obj, tx)); + + zp->z_is_sa = B_TRUE; +done: + if (drop_lock) + mutex_exit(&zp->z_lock); +} + +void +zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) +{ + if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) + return; + + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + if 
(zfs_external_acl(zp)) { + dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, + DMU_OBJECT_END); + } +} + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 287de4c..e9a956c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -45,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -55,17 +57,19 @@ #include #include #include +#include +#include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); -int zfs_super_owner = 0; +int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); -int zfs_debug_level = 0; +int zfs_debug_level; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); @@ -74,12 +78,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); -static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD, - &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION"); -static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD, - &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); @@ -156,9 +154,8 @@ zfs_sync(vfs_t *vfsp, int waitfor) } if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); - else - txg_wait_synced(dp, 0); + zil_commit(zfsvfs->z_log, 0); + ZFS_EXIT(zfsvfs); } else { /* @@ -172,6 +169,60 @@ zfs_sync(vfs_t *vfsp, int waitfor) return (0); } +#ifndef __FreeBSD__ +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. 
+ */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} +#endif /* !__FreeBSD__ */ + static void atime_changed_cb(void *arg, uint64_t newval) { @@ -313,14 +364,6 @@ vscan_changed_cb(void *arg, uint64_t newval) } static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; @@ -335,11 +378,11 @@ zfs_register_callbacks(vfs_t *vfsp) objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; - int readonly, do_readonly = FALSE; - int setuid, do_setuid = FALSE; - int exec, do_exec = FALSE; - int xattr, do_xattr = FALSE; - int atime, do_atime = FALSE; + int readonly, do_readonly = B_FALSE; + int setuid, do_setuid = B_FALSE; + int exec, do_exec = B_FALSE; + int xattr, do_xattr = B_FALSE; + int atime, do_atime = B_FALSE; int error = 0; ASSERT(vfsp); @@ -360,7 +403,8 @@ zfs_register_callbacks(vfs_t *vfsp) * of mount options, we stash away the current values and * restore them after we register the callbacks. */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { @@ -444,8 +488,6 @@ zfs_register_callbacks(vfs_t *vfsp) error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -483,7 +525,6 @@ unregister: (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -491,62 +532,53 @@ unregister: } -static void -uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, - int64_t delta, dmu_tx_t *tx) -{ - uint64_t used = 0; - char buf[32]; - int err; - uint64_t obj = isgroup ? 
DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - - if (delta == 0) - return; - - (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); - err = zap_lookup(os, obj, buf, 8, 1, &used); - ASSERT(err == 0 || err == ENOENT); - /* no underflow/overflow */ - ASSERT(delta > 0 || used >= -delta); - ASSERT(delta < 0 || used + delta > used); - used += delta; - if (used == 0) - err = zap_remove(os, obj, buf, tx); - else - err = zap_update(os, obj, buf, 8, 1, &used, tx); - ASSERT(err == 0); -} - -static void -zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype, - void *oldbonus, void *newbonus, - uint64_t oldused, uint64_t newused, dmu_tx_t *tx) +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) { - znode_phys_t *oldznp = oldbonus; - znode_phys_t *newznp = newbonus; + znode_phys_t *znp = data; + int error = 0; - if (bonustype != DMU_OT_ZNODE) - return; + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (ENOENT); - /* We charge 512 for the dnode (if it's allocated). */ - if (oldznp->zp_gen != 0) - oldused += DNODE_SIZE; - if (newznp->zp_gen != 0) - newused += DNODE_SIZE; + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (EEXIST); - if (oldznp->zp_uid == newznp->zp_uid) { - uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx); + if (bonustype == DMU_OT_ZNODE) { + *userp = znp->zp_uid; + *groupp = znp->zp_gid; } else { - uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx); - uidacct(os, B_FALSE, newznp->zp_uid, newused, tx); - } + int hdrsize; - if (oldznp->zp_gid == newznp->zp_gid) { - uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx); - } else { - uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx); - uidacct(os, B_TRUE, newznp->zp_gid, newused, tx); + ASSERT(bonustype == DMU_OT_SA); + hdrsize = sa_hdrsize(data); + + if (hdrsize != 0) { + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + } else { + /* + * This should only happen for newly created + * files that haven't had the znode data filled + * in yet. + */ + *userp = 0; + *groupp = 0; + } } + return (error); } static void @@ -733,7 +765,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, } boolean_t -zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; @@ -756,33 +788,48 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) return (used >= quota); } +boolean_t +zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) +{ + uint64_t fuid; + uint64_t quotaobj; + + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + fuid = isgroup ? 
zp->z_gid : zp->z_uid; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); +} + int -zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; uint64_t zval; int i, error; + uint64_t sa_obj; - if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL)) - return (error); - if (zval) - mode |= DS_MODE_READONLY; + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); - if (error == EROFS) { - mode |= DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); - } - if (error) + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); + } /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, * and just bzero up to z_hold_mtx[]. */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; @@ -792,15 +839,15 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error) { goto out; - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + } else if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); error = ENOTSUP; goto out; } - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) goto out; zfsvfs->z_norm = (int)zval; @@ -822,6 +869,29 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error) + return (error); + } else { + /* + * Pre SA versions file systems should never touch + * either the attribute registration or layout objects. 
+ */ + sa_obj = 0; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error) + goto out; + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); @@ -857,7 +927,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) goto out; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); @@ -867,12 +936,12 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - *zvp = zfsvfs; + *zfvp = zfsvfs; return (0); out: - dmu_objset_close(os); - *zvp = NULL; + dmu_objset_disown(os, zfsvfs); + *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } @@ -889,15 +958,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) /* * Set the objset user_ptr to track its zfsvfs. */ - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - if (zil_disable) { - zil_destroy(zfsvfs->z_log, B_FALSE); - zfsvfs->z_log = NULL; - } /* * If we are not mounting (ie: online recv), then we don't @@ -917,37 +982,42 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) else zfs_unlinked_drain(zfsvfs); - if (zfsvfs->z_log) { - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest - * doesn't use readonly mounts, where - * zfs_unlinked_drain() isn't called.) This is because - * ziltest causes spa_sync() to think it's committed, - * but actually it is not, so the intent log contains - * many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated - * in a yet later txg. This would write a "create - * object N" record to the intent log. Normally, this - * would be fine because the spa_sync() would have - * written out the fact that object N is free, before - * we could write the "create object N" intent log - * record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. 
This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } @@ -974,7 +1044,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs) zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); @@ -989,13 +1058,24 @@ static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { - vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int @@ -1009,7 +1089,7 @@ zfs_domount(vfs_t *vfsp, char *osname) ASSERT(vfsp); ASSERT(osname); - error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs); + error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; @@ -1026,7 +1106,6 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; - /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a @@ -1053,6 +1132,7 @@ zfs_domount(vfs_t *vfsp, char *osname) vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; @@ -1063,10 +1143,11 @@ zfs_domount(vfs_t *vfsp, char *osname) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } 
else { error = zfsvfs_setup(zfsvfs, B_TRUE); } @@ -1080,7 +1161,7 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); @@ -1121,9 +1202,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); @@ -1132,6 +1210,302 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) } } +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (EINVAL); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". + */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (EINVAL); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * zfs_check_global_label: + * Check that the hex label string is appropriate for the dataset + * being mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (EACCES); + return (rdonly ? 0 : EACCES); + } + return (EACCES); +} + +/* + * zfs_mount_label_policy: + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns: + * 0 : access allowed + * >0 : error code, such as EACCES + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EACCES); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. 
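Aside: the SECLABEL helpers above turn the loader-supplied "rootpool-name/root-filesystem-object-number" string into a pool name plus an object number. A minimal userspace sketch of the same parsing follows; str_to_u64 is a re-expression of str_to_uint64() and the sample string "rpool/85" is purely illustrative.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Userspace re-expression of str_to_uint64(): decimal digits only. */
static int
str_to_u64(const char *str, uint64_t *objnum)
{
	uint64_t num = 0;

	while (*str) {
		if (*str < '0' || *str > '9')
			return (EINVAL);
		num = num * 10 + (uint64_t)(*str++ - '0');
	}
	*objnum = num;
	return (0);
}

int
main(void)
{
	char bpath[] = "rpool/85";	/* example boot property value */
	char *slashp = strchr(bpath, '/');
	uint64_t objnum;

	if (slashp == NULL) {
		printf("pool only: %s\n", bpath);
	} else if (str_to_u64(slashp + 1, &objnum) != 0) {
		printf("no object number, keep dataset name: %s\n", bpath);
	} else {
		*slashp = '\0';
		/* the kernel would now map (pool, objnum) back to a dataset name */
		printf("pool %s, root dataset object %ju\n",
		    bpath, (uintmax_t)objnum);
	}
	return (0);
}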
+ */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (EACCES); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EACCES); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +#ifdef OPENSOLARIS_MOUNTROOT +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + char *zfs_bootfs; + char *zfs_devid; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in the + * boot property "zfs-bootfs" with a format of + * "poolname/root-dataset-objnum". + */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (EBUSY); + /* + * the process of doing a spa_load will require the + * clock to be set before we could (for example) do + * something better by looking at the timestamp on + * an uberblock, so just set it to -1. 
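Aside: the comparison at the end of zfs_mount_label_policy() reduces to a three-way decision. A hedged, self-contained restatement is below; label_equal and mnt_dominates_ds are stand-ins for the blequal() and bldominates() results on the real bslabel_t values, and the enum is invented for illustration.

/* Three-way MAC decision; inputs stand in for blequal()/bldominates(). */
enum mount_access { MOUNT_DENY, MOUNT_RW, MOUNT_RO };

static enum mount_access
label_policy(int label_equal, int mnt_dominates_ds)
{
	if (label_equal)
		return (MOUNT_RW);	/* identical labels: full access */
	if (mnt_dominates_ds)
		return (MOUNT_RO);	/* mountpoint dominates: force read-only */
	return (MOUNT_DENY);		/* otherwise the mount fails with EACCES */
}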
+ */ + clkset(-1); + + if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { + cmn_err(CE_NOTE, "spa_get_bootfs: can not get " + "bootfs name"); + return (EINVAL); + } + zfs_devid = spa_get_bootprop("diskdevid"); + error = spa_import_rootpool(rootfs.bo_name, zfs_devid); + if (zfs_devid) + spa_free_bootprop(zfs_devid); + if (error) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "spa_import_rootpool: error %d", + error); + return (error); + } + if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", + error); + return (error); + } + + spa_free_bootprop(zfs_bootfs); + + if (error = vfs_lock(vfsp)) + return (error); + + if (error = zfs_domount(vfsp, rootfs.bo_name)) { + cmn_err(CE_NOTE, "zfs_domount: error %d", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { + cmn_err(CE_NOTE, "zfs_zget: error %d", error); + goto out; + } + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * Leave rootvp held. The root file system is never unmounted. + */ + + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + return (error); + } else if (why == ROOT_REMOUNT) { + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + return (zfs_register_callbacks(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is equal to anything else other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (ENOTSUP); +} +#endif /* OPENSOLARIS_MOUNTROOT */ + /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) @@ -1203,6 +1577,12 @@ zfs_mount(vfs_t *vfsp) goto out; } +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + vfsp->vfs_flag |= MNT_NFS4ACLS; /* @@ -1291,6 +1671,25 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) return (0); } +int +zfs_vnode_lock(vnode_t *vp, int flags) +{ + int error; + + ASSERT(vp != NULL); + + /* + * Check if the file system wasn't forcibly unmounted in the meantime. 
+ */ + error = vn_lock(vp, flags); + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { + VOP_UNLOCK(vp, 0); + error = ENOENT; + } + + return (error); +} + static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { @@ -1301,14 +1700,18 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) ZFS_ENTER_NOERROR(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { - *vpp = ZTOV(rootzp); - error = vn_lock(*vpp, flags); - (*vpp)->v_vflag |= VV_ROOT; + error = zfs_vnode_lock(*vpp, flags); + if (error == 0) + (*vpp)->v_vflag |= VV_ROOT; } + if (error != 0) + *vpp = NULL; return (error); } @@ -1371,7 +1774,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_dbuf) { + if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } @@ -1416,10 +1819,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) /* * Evict cached data */ - if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); - } + if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os)) + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } @@ -1440,6 +1843,7 @@ zfs_umount(vfs_t *vfsp, int fflag) ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } + /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the @@ -1525,14 +1929,14 @@ zfs_umount(vfs_t *vfsp, int fflag) /* * Unset the objset user_ptr. */ - mutex_enter(&os->os->os_user_ptr_lock); + mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); - mutex_exit(&os->os->os_user_ptr_lock); + mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ - dmu_objset_close(os); + dmu_objset_disown(os, zfsvfs); } /* @@ -1572,13 +1976,13 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) VN_RELE(ZTOV(zp)); err = EINVAL; } + if (err == 0) + *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); + if (err == 0) + err = zfs_vnode_lock(*vpp, flags); if (err != 0) *vpp = NULL; - else { - *vpp = ZTOV(zp); - vn_lock(*vpp, flags); - } return (err); } @@ -1611,7 +2015,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; - int i, err; + int i, err; *vpp = NULL; @@ -1665,8 +2069,10 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) VN_HOLD(*vpp); } ZFS_EXIT(zfsvfs); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - return (0); + err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + if (err != 0) + *vpp = NULL; + return (err); } gen_mask = -1ULL >> (64 - 8 * i); @@ -1676,7 +2082,9 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) ZFS_EXIT(zfsvfs); return (err); } - zp_gen = zp->z_phys->zp_gen & gen_mask; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { @@ -1686,12 +2094,14 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) return (EINVAL); } - ZFS_EXIT(zfsvfs); - *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread); - return (0); + ZFS_EXIT(zfsvfs); + err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + if (err == 0) + 
vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); } /* @@ -1701,17 +2111,13 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep) +zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - - *modep = zfsvfs->z_os->os_mode; - if (name) - dmu_objset_name(zfsvfs->z_os, name); - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } @@ -1720,21 +2126,49 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep) * Reopen zfsvfs_t::z_os and release VOPs. */ int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, + &zfsvfs->z_os); if (err) { zfsvfs->z_os = NULL; } else { znode_t *zp; + uint64_t sa_obj = 0; + + /* + * Make sure version hasn't changed + */ + + err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION, + &zfsvfs->z_version); + + if (err) + goto bail; + + err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj); + + if (err && zfsvfs->z_version >= ZPL_VERSION_SA) + goto bail; + + if ((err = sa_setup(zfsvfs->z_os, sa_obj, + zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) + goto bail; + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(zfsvfs->z_os, + zfs_sa_upgrade); VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + zfs_set_fuid_feature(zfsvfs); + /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let @@ -1747,17 +2181,17 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); - } +bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* - * Since we couldn't reopen zfsvfs::z_os, force - * unmount this file system. + * Since we couldn't reopen zfsvfs::z_os, or + * setup the sa framework force unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); @@ -1773,9 +2207,11 @@ zfs_freevfs(vfs_t *vfsp) #ifdef sun /* * If this is a snapshot, we have an extra VFS_HOLD on our parent - * from zfs_mount(). Release it here. + * from zfs_mount(). Release it here. If we came through + * zfs_mountroot() instead, we didn't grab an extra hold, so + * skip the VFS_RELE for rootvfs. */ - if (zfsvfs->z_issnap) + if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif /* sun */ @@ -1825,17 +2261,17 @@ zfs_init(void) printf("ZFS filesystem version " ZPL_VERSION_STRING "\n"); /* - * Initialize znode cache, vnode ops, etc... + * Initialize .zfs directory structures */ - zfs_znode_init(); + zfsctl_init(); /* - * Initialize .zfs directory structures + * Initialize znode cache, vnode ops, etc... */ - zfsctl_init(); + zfs_znode_init(); /* - * Reduce number of vnode. Originally number of vnodes is calculated + * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. 
*/ @@ -1871,13 +2307,23 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) if (newvers < zfsvfs->z_version) return (EINVAL); + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (ENOTSUP); + tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); @@ -1886,20 +2332,35 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (error); } - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT3U(error, ==, 0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, newvers, dmu_objset_id(os)); dmu_tx_commit(tx); zfsvfs->z_version = newvers; - if (zfsvfs->z_version >= ZPL_VERSION_FUID) - zfs_set_fuid_feature(zfsvfs); + zfs_set_fuid_feature(zfsvfs); return (0); } + /* * Read a property stored within the master node. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index f2fdb7a..795a7bd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -23,6 +23,7 @@ */ /* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ #include #include @@ -47,10 +48,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -58,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -122,7 +126,7 @@ * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * - * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) + * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. 
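Aside: the guard added to zfs_set_version() above refuses any ZPL version whose minimum pool (SPA) version exceeds what the pool currently runs. A compact sketch of that check follows; zpl_to_spa_version() and its constants are illustrative stand-ins for zfs_spa_version_map(), not the real mapping table.

#include <errno.h>
#include <stdint.h>

/* Illustrative stand-in for zfs_spa_version_map(); the values are assumptions. */
static uint64_t
zpl_to_spa_version(uint64_t zplver)
{
	return (zplver >= 5 ? 24 : 9);	/* e.g. the SA-capable ZPL needs a newer pool */
}

static int
check_zpl_upgrade(uint64_t newvers, uint64_t curvers, uint64_t pool_version)
{
	if (newvers < curvers)
		return (EINVAL);	/* downgrades are rejected */
	if (zpl_to_spa_version(newvers) > pool_version)
		return (ENOTSUP);	/* pool version too old for this ZPL version */
	return (0);
}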
* * In general, this is how things should be ordered in each vnode op: @@ -154,7 +158,7 @@ * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * zil_commit(zilog, seq, foid); // synchronous when necessary + * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ @@ -169,7 +173,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (EPERM); @@ -177,8 +181,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) { + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (EACCES); @@ -216,8 +219,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); @@ -237,7 +239,7 @@ zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) int error; boolean_t hole; - file_sz = zp->z_phys->zp_size; + file_sz = zp->z_size; if (noff >= file_sz) { return (ENXIO); } @@ -370,7 +372,6 @@ zfs_unmap_page(struct sf_buf *sf) sf_buf_free(sf); } - /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: @@ -378,7 +379,6 @@ zfs_unmap_page(struct sf_buf *sf) * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ - static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) @@ -420,6 +420,71 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, } /* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. + * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain VPO_BUSY on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. 
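Aside: mappedread_sf(), introduced by the comment above, populates page-cache pages for sendfile(2); each page receives at most one page of file data and the remainder is zeroed so a short tail page never exposes stale memory. A userspace analog of that per-page fill is sketched here; PAGESIZE_ (assumed 4096) stands in for the kernel PAGESIZE and src/srclen model the dmu_read() result.

#include <stddef.h>
#include <string.h>

#define PAGESIZE_ 4096	/* stand-in for the kernel PAGESIZE */

/* Copy up to one page of data and zero whatever is left of the page. */
static void
fill_page(char page[PAGESIZE_], const char *src, size_t srclen)
{
	size_t bytes = srclen < PAGESIZE_ ? srclen : PAGESIZE_;

	memcpy(page, src, bytes);
	if (bytes != PAGESIZE_)
		memset(page + bytes, 0, PAGESIZE_ - bytes);
}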
+ */ +static int +mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + vm_page_t pp; + int64_t start; + caddr_t va; + int len = nbytes; + int off; + int error = 0; + + ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + + VM_OBJECT_LOCK(obj); + for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY); + if (pp->valid == 0) { + vm_page_io_start(pp); + VM_OBJECT_UNLOCK(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + VM_OBJECT_LOCK(obj); + vm_page_io_finish(pp); + vm_page_lock(pp); + if (error) { + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); + } + if (error) + break; + uio->uio_resid -= bytes; + uio->uio_offset += bytes; + len -= bytes; + } + VM_OBJECT_UNLOCK(obj); + return (error); +} + +/* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * @@ -435,14 +500,11 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio) znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; - vm_page_t m; - struct sf_buf *sf; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; - uint64_t dirbytes; ASSERT(vp->v_mount != NULL); obj = vp->v_object; @@ -450,98 +512,25 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio) start = uio->uio_loffset; off = start & PAGEOFFSET; - dirbytes = 0; VM_OBJECT_LOCK(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - int bytes = MIN(PAGESIZE - off, len); + vm_page_t pp; + uint64_t bytes = MIN(PAGESIZE - off, len); -again: - if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && - vm_page_is_valid(m, off, bytes)) { - if ((m->oflags & VPO_BUSY) != 0) { - /* - * Reference the page before unlocking and - * sleeping so that the page daemon is less - * likely to reclaim it. - */ - vm_page_lock_queues(); - vm_page_flag_set(m, PG_REFERENCED); - vm_page_sleep(m, "zfsmrb"); - goto again; - } + if (pp = page_lookup(vp, start, off, bytes)) { + struct sf_buf *sf; + caddr_t va; - vm_page_busy(m); VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_read_uio(os, zp->z_id, uio, - dirbytes); - dirbytes = 0; - } - if (error == 0) - uiomove_fromphys(&m, off, bytes, uio); + va = zfs_map_page(pp, &sf); + error = uiomove(va + off, bytes, UIO_READ, uio); + zfs_unmap_page(sf); VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - } else if (uio->uio_segflg == UIO_NOCOPY) { - /* - * The code below is here to make sendfile(2) work - * correctly with ZFS. As pointed out by ups@ - * sendfile(2) should be changed to use VOP_GETPAGES(), - * but it pessimize performance of sendfile/UFS, that's - * why I handle this special case in ZFS code. - */ - KASSERT(off == 0, - ("unexpected offset in mappedread for sendfile")); - if (m != NULL && (m->oflags & VPO_BUSY) != 0) { - /* - * Reference the page before unlocking and - * sleeping so that the page daemon is less - * likely to reclaim it. 
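Aside: the rewritten mappedread() loop above works one page-sized chunk at a time, copying from the resident page when page_lookup() finds one and otherwise falling back to the DMU for just that chunk. A simplified userspace analog under those assumptions (cache[i] == NULL models a non-resident page, backing models the authoritative file contents):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PGSZ 4096	/* stand-in for the kernel PAGESIZE */

static void
mapped_read(char *dst, size_t len, uint64_t off,
    char *const cache[], const char *backing)
{
	while (len > 0) {
		size_t pgoff = (size_t)(off & (PGSZ - 1));
		size_t bytes = (PGSZ - pgoff) < len ? PGSZ - pgoff : len;
		const char *src = cache[off / PGSZ] != NULL ?
		    cache[off / PGSZ] + pgoff :	/* cached page wins */
		    backing + off;		/* else read backing store */

		memcpy(dst, src, bytes);
		dst += bytes;
		off += bytes;
		len -= bytes;
	}
}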
- */ - vm_page_lock_queues(); - vm_page_flag_set(m, PG_REFERENCED); - vm_page_sleep(m, "zfsmrb"); - goto again; - } else if (m == NULL) { - m = vm_page_alloc(obj, OFF_TO_IDX(start), - VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL); - if (m == NULL) { - VM_OBJECT_UNLOCK(obj); - VM_WAIT; - VM_OBJECT_LOCK(obj); - goto again; - } - } - vm_page_io_start(m); + page_unlock(pp); + } else { VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_read_uio(os, zp->z_id, uio, - dirbytes); - dirbytes = 0; - } - if (error == 0) { - va = zfs_map_page(m, &sf); - error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH); - if (bytes != PAGE_SIZE) - bzero(va + bytes, PAGE_SIZE - bytes); - zfs_unmap_page(sf); - } + error = dmu_read_uio(os, zp->z_id, uio, bytes); VM_OBJECT_LOCK(obj); - vm_page_io_finish(m); - vm_page_lock(m); - if (error == 0) { - m->valid = VM_PAGE_BITS_ALL; - vm_page_activate(m); - } else - vm_page_free(m); - vm_page_unlock(m); - - if (error == 0) { - uio->uio_resid -= bytes; - uio->uio_offset += bytes; - } - } else { - dirbytes += bytes; } len -= bytes; off = 0; @@ -549,8 +538,6 @@ again: break; } VM_OBJECT_UNLOCK(obj); - if (error == 0 && dirbytes > 0) - error = dmu_read_uio(os, zp->z_id, uio, dirbytes); return (error); } @@ -584,12 +571,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ssize_t n, nbytes; int error; rl_t *rl; + xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); os = zfsvfs->z_os; - if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (EACCES); } @@ -613,7 +601,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * Check for mandatory locks */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); @@ -624,8 +612,8 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * If we're in FRSYNC mode, sync out this znode before reading it. */ - if (ioflag & FRSYNC) - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. @@ -636,18 +624,54 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ - if (uio->uio_loffset >= zp->z_phys->zp_size) { + if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } - ASSERT(uio->uio_loffset < zp->z_phys->zp_size); - n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + ASSERT(uio->uio_loffset < zp->z_size); + n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + +#ifdef sun + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(vp)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. 
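Aside: the zero-copy (xuio) setup above counts how many file blocks the requested range touches with P2ROUNDUP/P2ALIGN. A small runnable check of that arithmetic follows; the macro definitions are the usual power-of-two helpers and the offsets are arbitrary example values.

#include <stdint.h>
#include <stdio.h>

/* Power-of-two helpers as conventionally defined in sysmacros.h (assumed). */
#define P2ALIGN(x, align)	((x) & -(align))
#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uint64_t offset = 100 * 1024 + 512;	/* example read offset */
	uint64_t n = 300 * 1024;		/* example read length */
	uint64_t blksz = 128 * 1024;		/* power-of-two file block size */

	/* Number of file blocks the range [offset, offset + n) touches. */
	uint64_t nblk =
	    (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz;

	printf("%ju blocks\n", (uintmax_t)nblk);	/* prints 4 */
	return (0);
}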
+ */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); + } + } + } +#endif /* sun */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); +#ifdef __FreeBSD__ + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(vp, nbytes, uio); + else +#endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) error = mappedread(vp, nbytes, uio); else @@ -661,7 +685,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } - out: zfs_range_unlock(rl); @@ -671,53 +694,6 @@ out: } /* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. - * Any error will exit this routine as this is only a best - * attempt to get the pages resident. This is a copy of ufs_trans_touch(). - */ -static void -zfs_prefault_write(ssize_t n, struct uio *uio) -{ - struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - - if (uio->uio_segflg != UIO_USERSPACE) - return; - - iov = uio->uio_iov; - - while (n) { - cnt = MIN(iov->iov_len, n); - if (cnt == 0) { - /* empty iov entry */ - iov++; - continue; - } - n -= cnt; - /* - * touch each page in this segment. - */ - p = iov->iov_base; - while (cnt) { - if (fubyte(p) == -1) - return; - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. - */ - p--; - if (fubyte(p) == -1) - return; - iov++; - } -} - -/* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. @@ -735,6 +711,7 @@ zfs_prefault_write(ssize_t n, struct uio *uio) * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ + /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) @@ -751,9 +728,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; - uint64_t pflags; int error; arc_buf_t *abuf; + iovec_t *aiov; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write @@ -768,13 +753,19 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + /* * If immutable or not appending then return EPERM */ - pflags = zp->z_phys->zp_flags; - if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_phys->zp_size))) { + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -782,44 +773,61 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zilog = zfsvfs->z_log; /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? 
zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Check for mandatory locks before calling zfs_range_lock() + * in order to prevent a deadlock with locks set via fcntl(). + */ + if (MANDMODE((mode_t)zp->z_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + +#ifdef sun + /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. + * Skip this if uio contains loaned arc_buf. */ - zfs_prefault_write(n, uio); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif /* sun */ /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* - * Range lock for a file append: - * The value for the start of range will be determined by - * zfs_range_lock() (to guarantee append semantics). - * If this write will cause the block size to increase, - * zfs_range_lock() will lock the entire file, so we must - * later reduce the range after we grow the block size. + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); + woff = rl->r_off; if (rl->r_len == UINT64_MAX) { - /* overlocked, zp_size can't change */ - woff = uio->uio_loffset = zp->z_phys->zp_size; - } else { - woff = uio->uio_loffset = rl->r_off; + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; } + uio->uio_loffset = woff; } else { - woff = uio->uio_loffset; - /* - * Validate file offset - */ - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - /* - * If we need to grow the block size then zfs_range_lock() - * will lock a wider range than we request here. - * Later after growing the block size we reduce the range. + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } @@ -833,16 +841,10 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (error); - } - end_size = MAX(zp->z_phys->zp_size, woff + n); + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + + end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written @@ -852,31 +854,41 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while (n > 0) { abuf = NULL; woff = uio->uio_loffset; - again: - if (zfs_usergroup_overquota(zfsvfs, - B_FALSE, zp->z_phys->zp_uid) || - zfs_usergroup_overquota(zfsvfs, - B_TRUE, zp->z_phys->zp_gid)) { + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = EDQUOT; break; } - /* - * If dmu_assign_arcbuf() is expected to execute with minimum - * overhead loan an arc buffer and copy user data to it before - * we enter a txg. 
This avoids holding a txg forever while we - * pagefault on a hanging NFS server mapping. - */ - if (abuf == NULL && n >= max_blksz && - woff >= zp->z_phys->zp_size && + if (xuio && abuf == NULL) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (abuf == NULL && n >= max_blksz && + woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ size_t cbytes; - abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, @@ -891,8 +903,9 @@ again: * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (error == ERESTART) { @@ -931,22 +944,38 @@ again: */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - if (woff + nbytes > zp->z_phys->zp_size) + if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, - nbytes, tx); + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; - ASSERT(tx_bytes == max_blksz); - dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), + woff, abuf, tx); + } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } - if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); @@ -957,6 +986,8 @@ again: * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; @@ -974,29 +1005,41 @@ again: * user 0 is not an ephemeral uid. 
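Aside: the "borrow a buffer from the dmu" branch above only applies when a write chunk covers exactly one full, block-aligned, max-sized block lying entirely past EOF. The predicate can be restated on its own as below; the function name is invented for illustration and P2PHASE uses its conventional power-of-two definition.

#include <stdbool.h>
#include <stdint.h>

#define P2PHASE(x, align)	((x) & ((align) - 1))

/* May this chunk be staged in a borrowed ARC buffer before the tx? */
static bool
can_borrow_arcbuf(uint64_t woff, uint64_t n, uint64_t z_size,
    uint64_t z_blksz, uint64_t max_blksz)
{
	return (n >= max_blksz &&		/* chunk fills a whole block */
	    woff >= z_size &&			/* starts at or past EOF */
	    P2PHASE(woff, max_blksz) == 0 &&	/* block-aligned offset */
	    z_blksz == max_blksz);		/* file already at max block size */
}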
*/ mutex_enter(&zp->z_acl_lock); - if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && - (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, - (zp->z_phys->zp_mode & S_ISUID) != 0 && - zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); - /* - * Update time stamp. NOTE: This marks the bonus buffer as - * dirty, so we don't have to do it again for zp_size. - */ - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) - (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); + ASSERT(error == 0); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); @@ -1004,6 +1047,11 @@ again: break; ASSERT(tx_bytes == nbytes); n -= nbytes; + +#ifdef sun + if (!xuio && n > 0) + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif /* sun */ } zfs_range_unlock(rl); @@ -1017,31 +1065,36 @@ again: return (error); } - if (ioflag & (FSYNC | FDSYNC)) - zil_commit(zilog, zp->z_last_itx, zp->z_id); + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void -zfs_get_done(dmu_buf_t *db, void *vzgd) +zfs_get_done(zgd_t *zgd, int error) { - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - vnode_t *vp = ZTOV(rl->r_zp); - objset_t *os = rl->r_zp->z_zfsvfs->z_os; + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; int vfslocked; - vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_range_unlock(zgd->zgd_rl); + + vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. 
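Aside: the file-size update in zfs_write() above only ever grows z_size and tolerates racing writers by retrying atomic_cas_64() until its end offset no longer exceeds the size. A C11 userspace analog of that loop, with grow_size() being an invented name:

#include <stdatomic.h>
#include <stdint.h>

static void
grow_size(_Atomic uint64_t *sizep, uint64_t end_offset)
{
	uint64_t cur = atomic_load(sizep);

	/* Retry until the size covers end_offset or someone else grew it further. */
	while (cur < end_offset &&
	    !atomic_compare_exchange_weak(sizep, &cur, end_offset))
		;	/* on failure cur is reloaded; the condition rechecks */
}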
*/ - VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os))); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + kmem_free(zgd, sizeof (zgd_t)); VFS_UNLOCK_GIANT(vfslocked); } @@ -1059,20 +1112,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; - uint64_t off = lr->lr_offset; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; - rl_t *rl; zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ int error = 0; - ASSERT(zio); - ASSERT(dlen != 0); + ASSERT(zio != NULL); + ASSERT(size != 0); /* * Nothing to do if the file has been removed */ - if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) + if (zfs_zget(zfsvfs, object, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { /* @@ -1084,6 +1138,10 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) return (ENOENT); } + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zfsvfs->z_log; + zgd->zgd_private = zp; + /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -1092,17 +1150,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - rl = zfs_range_lock(zp, off, dlen, RL_READER); + zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { + if (offset >= zp->z_size) { error = ENOENT; - goto out; + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf, - DMU_READ_NO_PREFETCH)); + ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ - uint64_t boff; /* block starting offset */ - /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated @@ -1110,80 +1167,59 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * blocksize after we get the lock in case it's changed! */ for (;;) { - if (ISP2(zp->z_blksz)) { - boff = P2ALIGN_TYPED(off, zp->z_blksz, - uint64_t); - } else { - boff = 0; - } - dlen = zp->z_blksz; - rl = zfs_range_lock(zp, boff, dlen, RL_READER); - if (zp->z_blksz == dlen) + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_rl = zfs_range_lock(zp, offset, size, + RL_READER); + if (zp->z_blksz == size) break; - zfs_range_unlock(rl); + offset += blkoff; + zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { + if (lr->lr_offset >= zp->z_size) error = ENOENT; - goto out; - } - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_rl = rl; - zgd->zgd_zilog = zfsvfs->z_log; - zgd->zgd_bp = &lr->lr_blkptr; #ifdef DEBUG if (zil_fault_io) { error = EIO; zil_fault_io = 0; - } else { - error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db); } -#else - error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db); #endif - if (error != 0) { - kmem_free(zgd, sizeof (zgd_t)); - goto out; - } + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); - ASSERT(boff == db->db_offset); - lr->lr_blkoff = off - boff; - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT((error && error != EINPROGRESS) || - lr->lr_length <= zp->z_blksz); if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= zp->z_blksz); + /* - * dmu_sync() can compress a block of zeros to a null - * blkptr but the block size still needs to be passed - * through to replay. + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. */ - BP_SET_LSIZE(&lr->lr_blkptr, db->db_size); - zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); - } + if (error == 0) + return (0); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zfs_get_done() callback. - */ - if (error == EINPROGRESS) { - return (0); - } else if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - error = 0; + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + error = 0; + } } - dmu_buf_rele(db, zgd); - kmem_free(zgd, sizeof (zgd_t)); } -out: - zfs_range_unlock(rl); - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + + zfs_get_done(zgd, error); + return (error); } @@ -1267,7 +1303,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, if (dvp->v_type != VDIR) { return (ENOTDIR); - } else if (zdp->z_dbuf == NULL) { + } else if (zdp->z_sa_hdl == NULL) { return (EIO); } @@ -1321,7 +1357,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * We don't allow recursive attributes.. * Maybe someday we will. 
*/ - if (zdp->z_phys->zp_flags & ZFS_XATTR) { + if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -1394,7 +1430,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, VOP_UNLOCK(dvp, 0); } ZFS_EXIT(zfsvfs); - error = vn_lock(*vpp, cnp->cn_lkflags); + error = zfs_vnode_lock(*vpp, cnp->cn_lkflags); if (cnp->cn_flags & ISDOTDOT) vn_lock(dvp, ltype | LK_RETRY); if (error != 0) { @@ -1466,8 +1502,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; + zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; void *vsecp = NULL; int flag = 0; @@ -1481,9 +1518,10 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, uid = ksid_getid(ksid); else uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (EINVAL); ZFS_ENTER(zfsvfs); @@ -1528,12 +1566,15 @@ top: error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); return (error); } } + if (zp == NULL) { uint64_t txtype; @@ -1542,6 +1583,8 @@ top: * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); goto out; } @@ -1549,16 +1592,20 @@ top: * We only support the creation of regular files in * extended attribute directories. */ - if ((dzp->z_phys->zp_flags & ZFS_XATTR) && + + if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); error = EINVAL; goto out; } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, - &acl_ids)) != 0) + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) goto out; + have_acl = B_TRUE; + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = EDQUOT; @@ -1566,36 +1613,39 @@ top: } tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); + 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } + zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dl, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; @@ -1606,6 +1656,10 @@ top: } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + /* * A directory entry already exists for this name. 
*/ @@ -1660,6 +1714,9 @@ out: error = specvp_check(vpp, cr); } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); } @@ -1680,17 +1737,22 @@ out: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ + +uint64_t null_xattr = 0; + /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *zp, *dzp = VTOZ(dvp); - znode_t *xzp = NULL; + znode_t *xzp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; @@ -1712,6 +1774,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, } top: + xattr_obj = 0; + xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ @@ -1744,7 +1808,9 @@ top: else dnlc_remove(dvp, name); - may_delete_now = FALSE; + VI_LOCK(vp); + may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); + VI_UNLOCK(vp); /* * We may delete the znode now, or we may put it in the unlinked set; @@ -1752,27 +1818,34 @@ top: * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ + obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = - zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; + zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ - if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { - /* XXX - do we need this if we are deleting? */ - dmu_tx_hold_bonus(tx, xattr_obj); + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT3U(error, ==, 0); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } - /* are there any additional acls */ - if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && - may_delete_now) + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); @@ -1781,6 +1854,8 @@ top: if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); + if (xzp) + VN_RELE(ZTOV(xzp)); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); @@ -1803,29 +1878,45 @@ top: goto out; } - if (0 && unlinked) { + if (unlinked) { + + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). 
+ */ + mutex_enter(&zp->z_lock); VI_LOCK(vp); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && - zp->z_phys->zp_xattr == xattr_obj && - zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == + acl_obj; VI_UNLOCK(vp); } if (delete_now) { - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT3U(error, ==, 0); - ASSERT3U(xzp->z_phys->zp_links, ==, 2); - dmu_buf_will_dirty(xzp->z_dbuf, tx); + if (xattr_obj_unlinked) { + ASSERT3U(xzp->z_links, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; - xzp->z_phys->zp_links = 0; + xzp->z_links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx); + ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); - zp->z_phys->zp_xattr = 0; /* probably unnecessary */ + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT3U(error, ==, 0); } - mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count--; ASSERT3U(vp->v_count, ==, 0); @@ -1833,13 +1924,14 @@ top: mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); } else if (unlinked) { + mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); + zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: @@ -1848,12 +1940,13 @@ out: zfs_dirent_unlock(dl); - if (!delete_now) { + if (!delete_now) VN_RELE(vp); - } else if (xzp) { - /* this rele is delayed to prevent nesting transactions */ + if (xzp) VN_RELE(ZTOV(xzp)); - } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -1895,7 +1988,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; + zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1911,15 +2004,15 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| - IS_EPHEMERAL(crgetgid(cr)))) + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (EINVAL); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; - if (dzp->z_phys->zp_flags & ZFS_XATTR) { + if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -1932,37 +2025,43 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, if (flags & FIGNORECASE) zf |= ZCILOOK; - if (vap->va_mask & AT_XVATTR) + if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } + } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. 
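Aside: the reordering explained in the comment above makes zfs_mkdir() report a pre-existing entry as EEXIST rather than failing the earlier permission check with EACCES. A minimal restatement of that ordering; exists and may_add stand in for the zfs_dirent_lock() and zfs_zaccess() outcomes.

#include <errno.h>
#include <stdbool.h>

static int
mkdir_precheck(bool exists, bool may_add)
{
	if (exists)
		return (EEXIST);	/* existence is reported first */
	if (!may_add)
		return (EACCES);	/* only then the permission failure */
	return (0);
}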
*/ top: *vpp = NULL; if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL)) { + zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, - &acl_ids)) != 0) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); @@ -1979,18 +2078,23 @@ top: fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } + zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); @@ -1999,10 +2103,11 @@ top: /* * Create new node. */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); + /* * Now put new name in parent dir. */ @@ -2017,10 +2122,14 @@ top: acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); zfs_dirent_unlock(dl); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (0); } @@ -2108,8 +2217,10 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); @@ -2136,7 +2247,7 @@ top: uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); @@ -2151,6 +2262,9 @@ out: VN_RELE(vp); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); } @@ -2197,6 +2311,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; int local_eof; int outcount; int error; @@ -2210,6 +2325,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* * If we are not given an eof variable, * use a local one. @@ -2273,8 +2394,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon * Minimum entry size is dirent size and 1 byte for a file name. 
*/ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); - cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); - *cookies = cooks; + *cookies = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *ncookies = ncooks; } /* @@ -2298,7 +2418,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; - off64_t *next; + off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. @@ -2311,7 +2431,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; - objnum = zp->z_phys->zp_parent; + objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); @@ -2421,6 +2541,16 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon if (prefetch) dmu_prefetch(os, objnum, 0, 0); + if (ncookies != NULL) { + if (cooks == NULL) + cooks = *cookies; + else { + *cooks++ = offset; + ncooks--; + KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); + } + } + skip_entry: /* * Move to the next entry, fill in the previous offset. @@ -2431,12 +2561,6 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon } else { offset += 1; } - - if (cooks != NULL) { - *cooks++ = offset; - ncooks--; - KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); - } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ @@ -2485,10 +2609,12 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); - ZFS_EXIT(zfsvfs); + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } return (0); } @@ -2515,26 +2641,38 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; + uint64_t mtime[2], ctime[2], crtime[2]; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[3]; + int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &crtime, 16); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. 
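
/*
 * Self-contained illustration of the cookie sizing in zfs_readdir() above:
 * the smallest possible directory entry is a struct dirent carrying a
 * one-byte name, so dividing the caller's uio_resid by that size bounds how
 * many entries (and therefore how many u_long cookies) one request can
 * return.  The 4 KB request size is an assumption of this example.
 */
#include <dirent.h>
#include <stdio.h>

int
main(void)
{
        size_t min_entry = sizeof (struct dirent) -
            sizeof (((struct dirent *)NULL)->d_name) + 1;
        size_t uio_resid = 4096;

        printf("minimum dirent size: %zu bytes\n", min_entry);
        printf("cookies to allocate: %zu\n", uio_resid / min_entry);
        return (0);
}
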
*/ - if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && - (pzp->zp_uid != crgetuid(cr))) { + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); @@ -2548,19 +2686,18 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ mutex_enter(&zp->z_lock); - vap->va_type = IFTOVT(pzp->zp_mode); - vap->va_mode = pzp->zp_mode & ~S_IFMT; - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; // vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) - links = pzp->zp_links + 1; + links = zp->z_links + 1; else - links = pzp->zp_links; + links = zp->z_links; vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ - vap->va_size = pzp->zp_size; + vap->va_size = zp->z_size; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; - vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); +// vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ @@ -2571,110 +2708,114 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = - ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = - ((pzp->zp_flags & ZFS_READONLY) != 0); + ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = - ((pzp->zp_flags & ZFS_SYSTEM) != 0); + ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = - ((pzp->zp_flags & ZFS_HIDDEN) != 0); + ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = - ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = - ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = - ((pzp->zp_flags & ZFS_NODUMP) != 0); + ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = - ((pzp->zp_flags & ZFS_OPAQUE) != 0); + ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - vp->v_type == VREG && - (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - - /* - * Only VREG files have anti-virus scanstamps, so we - * won't conflict with symlinks in the bonus buffer. 
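
/*
 * zfs_getattr() above now gathers its timestamps through the SA bulk
 * interface instead of dereferencing a znode_phys_t: each wanted attribute
 * is described with SA_ADD_BULK_ATTR() and the whole set is fetched by a
 * single sa_bulk_lookup() against the znode's SA handle.  Minimal sketch of
 * that pattern, ZFS kernel context assumed:
 */
static int
fetch_times_sketch(znode_t *zp, zfsvfs_t *zfsvfs,
    uint64_t mtime[2], uint64_t ctime[2], uint64_t crtime[2])
{
        sa_bulk_attr_t bulk[3];
        int count = 0;

        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, crtime, 16);

        /* Either every attribute is filled in or the lookup fails whole. */
        return (sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
}
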
- */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len <= doi.doi_bonus_size) { - /* - * pzp points to the start of the - * znode_phys_t. pzp + 1 points to the - * first byte after the znode_phys_t. - */ - (void) memcpy(xoap->xoa_av_scanstamp, - pzp + 1, - sizeof (xoap->xoa_av_scanstamp)); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); + uint64_t times[2]; + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)); + ZFS_TIME_DECODE(&xoap->xoa_createtime, times); XVA_SET_RTN(xvap, XAT_CREATETIME); } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } } - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, crtime); mutex_exit(&zp->z_lock); - dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ @@ -2713,7 +2854,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); - znode_phys_t *pzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; @@ -2725,15 +2865,19 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; - int err; + int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; - zfs_acl_t *aclp = NULL; + zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; if (mask == 0) return (0); @@ -2744,7 +2888,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; zilog = zfsvfs->z_log; /* @@ -2781,14 +2924,14 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, /* * Immutable files can only alter immutable bit and atime */ - if ((pzp->zp_flags & ZFS_IMMUTABLE) && + if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (EPERM); } - if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { + if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -2809,6 +2952,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, top: attrzp = NULL; + aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { @@ -2844,10 +2988,13 @@ top: ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); + } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); @@ -2860,7 +3007,7 @@ top: */ if (!(mask & AT_MODE)) - vap->va_mode = pzp->zp_mode; + vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of @@ -2898,7 +3045,7 @@ top: } mutex_enter(&zp->z_lock); - oldva.va_mode = pzp->zp_mode; + oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* @@ -2910,7 +3057,7 @@ top: */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); @@ -2920,7 +3067,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); @@ -2930,7 +3077,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); @@ -2940,7 +3087,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); @@ -2950,7 +3097,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); @@ -2962,7 +3109,7 @@ top: if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); @@ -2970,6 +3117,12 @@ top: } } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 
+ mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (EPERM); + } + if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { @@ -3038,79 +3191,89 @@ top: */ mask = vap->va_mask; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (mask & AT_MODE) { - uint64_t pmode = pzp->zp_mode; - - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) - goto out; - if (pzp->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL from old V0 format to new V1 */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - pzp->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - pzp->zp_acl.z_acl_extern_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, - aclp->z_acl_bytes); - } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - } + if ((mask & (AT_UID | AT_GID))) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); - if (mask & (AT_UID | AT_GID)) { - if (pzp->zp_xattr) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) - goto out; - dmu_tx_hold_bonus(tx, attrzp->z_id); + goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_uid != pzp->zp_uid && - zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + if (new_uid != zp->z_uid && + zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); err = EDQUOT; - goto out; + goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); - if (new_gid != pzp->zp_gid && - zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + if (new_gid != zp->z_gid && + zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); err = EDQUOT; - goto out; + goto out2; } } - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); + } + tx = dmu_tx_create(zfsvfs->z_os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + zfs_acl_chmod_setattr(zp, &aclp, new_mode); + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? 
+ */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if ((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_NOWAIT); if (err) { if (err == ERESTART) @@ -3118,8 +3281,7 @@ top: goto out; } - dmu_buf_will_dirty(zp->z_dbuf, tx); - + count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. @@ -3128,47 +3290,108 @@ top: * updated as a side-effect of calling this function. */ + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + mutex_enter(&attrzp->z_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + if (mask & AT_MODE) { - mutex_enter(&zp->z_acl_lock); - zp->z_phys->zp_mode = new_mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; - mutex_exit(&zp->z_acl_lock); } - if (attrzp) - mutex_enter(&attrzp->z_lock); - if (mask & AT_UID) { - pzp->zp_uid = new_uid; - if (attrzp) - attrzp->z_phys->zp_uid = new_uid; + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); } - if (mask & AT_GID) { - pzp->zp_gid = new_gid; - if (attrzp) - attrzp->z_phys->zp_gid = 
new_gid; + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); } - if (attrzp) - mutex_exit(&attrzp->z_lock); - - if (mask & AT_ATIME) - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); - - if (mask & AT_MTIME) - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); - /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ - if (mask & AT_SIZE) - zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); - else if (mask != 0) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime, B_TRUE); + } + } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit @@ -3200,20 +3423,10 @@ top: XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); - /* Grow the bonus buffer if necessary. */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); - } - zfs_xvattr_set(zp, xvap); + zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) @@ -3223,11 +3436,23 @@ top: zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + if (attrzp) VN_RELE(ZTOV(attrzp)); - if (aclp) zfs_acl_free(aclp); @@ -3236,13 +3461,18 @@ out: fuidp = NULL; } - if (err) + if (err) { dmu_tx_abort(tx); - else + if (err == ERESTART) + goto top; + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); + } - if (err == ERESTART) - goto top; +out2: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); @@ -3283,7 +3513,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t *oidp = &zp->z_id; + uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; @@ -3305,7 +3535,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; - oidp = &zp->z_id; + oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; @@ -3323,19 +3553,20 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) zl->zl_next = *zlpp; *zlpp = zl; - if (*oidp == szp->z_id) /* We're a descendant of szp */ + if (oidp == szp->z_id) /* We're a descendant of 
szp */ return (EINVAL); - if (*oidp == rootid) /* We've hit the top */ + if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); + int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); if (error) return (error); zl->zl_znode = zp; } - oidp = &zp->z_phys->zp_parent; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), + &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; @@ -3415,8 +3646,7 @@ top: * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ - if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != - (sdzp->z_phys->zp_flags & ZFS_XATTR)) { + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -3517,6 +3747,11 @@ top: if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); + /* + * FreeBSD: In OpenSolaris they only check if rename source is + * ".." here, because "." is handled in their lookup. This is + * not the case for FreeBSD, so we check for "." explicitly. + */ if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); @@ -3596,14 +3831,20 @@ top: } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ - dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) - dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - if (tzp) - dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { @@ -3634,17 +3875,39 @@ top: if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { - szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; + szp->z_pflags |= ZFS_AV_MODIFIED; - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - ASSERT(error == 0); + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT3U(error, ==, 0); - zfs_log_rename(zilog, tx, - TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), - sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); - /* Update path information for the target vnode */ - vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, ZTOV(szp), tnm, + strlen(tnm)); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. 
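
/*
 * Condensed view of the rename commit sequence described above: the new
 * name is created first, the old name removed second, and if that second
 * step fails the freshly created target link is torn down again so the link
 * count stays consistent and the original error is returned.  Sketch only;
 * it assumes the ZFS kernel context and the dirent locks held by
 * zfs_rename().
 */
static int
rename_commit_sketch(zfs_dirlock_t *sdl, zfs_dirlock_t *tdl, znode_t *szp,
    dmu_tx_t *tx)
{
        int error;

        error = zfs_link_create(tdl, szp, tx, ZRENAMING);
        if (error != 0)
                return (error);
        error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
        if (error != 0) {
                /* Undo the create and report the original failure. */
                VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL),
                    ==, 0);
        }
        return (error);
}
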
+ */ + VERIFY3U(zfs_link_destroy(tdl, szp, tx, + ZRENAMING, NULL), ==, 0); + } } #ifdef FREEBSD_NAMECACHE if (error == 0) { @@ -3665,10 +3928,14 @@ out: if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); + VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); @@ -3701,11 +3968,12 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - int len = strlen(link); + uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); @@ -3721,27 +3989,35 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, } if (flags & FIGNORECASE) zflg |= ZCILOOK; -top: - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (ENAMETOOLONG); } + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } +top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } - VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); @@ -3751,71 +4027,59 @@ top: tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } + zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } - dmu_buf_will_dirty(dzp->z_dbuf, tx); - /* * Create a new object for the symlink. - * Put the link content into bonus buffer if it will fit; - * otherwise, store it just like any other file data. + * for version 4 ZPL datsets the symlink will be an SA attribute */ - if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); - if (len != 0) - bcopy(link, zp->z_phys + 1, len); - } else { - dmu_buf_t *dbp; - - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* - * Nothing can access the znode yet so no locking needed - * for growing the znode's blocksize. 
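
/*
 * A pattern this patch adds at the tail of nearly every namespace-changing
 * operation (remove, mkdir, rmdir, rename, symlink, link, ...): when the
 * dataset is mounted with sync=always, the intent-log record that was just
 * written is forced to stable storage before the call returns, using the
 * new two-argument zil_commit().  Minimal sketch, ZFS kernel context
 * assumed:
 */
static void
sync_always_tail_sketch(zfsvfs_t *zfsvfs, zilog_t *zilog)
{
        /* Force the just-logged intent-log record out before returning. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);
}
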
- */ - zfs_grow_blocksize(zp, len, tx); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, - zp->z_id, 0, FTAG, &dbp)); - dmu_buf_will_dirty(dbp, tx); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } - zp->z_phys->zp_size = len; + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); - if (error == 0) { - uint64_t txtype = TX_SYMLINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *vpp = ZTOV(zp); - } + + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); @@ -3823,6 +4087,9 @@ top: zfs_dirent_unlock(dl); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); } @@ -3850,29 +4117,21 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - size_t bufsz; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - bufsz = (size_t)zp->z_phys->zp_size; - if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { - error = uiomove(zp->z_phys + 1, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); return (error); } @@ -3938,7 +4197,12 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, /* Prevent links to .zfs/shares files */ - if (szp->z_phys->zp_parent == zfsvfs->z_shares_dir) { + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -3957,16 +4221,14 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, * into "normal" file space in order to circumvent restrictions * imposed in attribute space. 
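
/*
 * With this patch the symlink target no longer lives after the znode_phys_t
 * in the bonus buffer: for SA-based znodes it is stored in the
 * SA_ZPL_SYMLINK attribute (and read back in zfs_readlink() above with
 * sa_lookup_uio()), while older non-SA znodes go through the
 * zfs_sa_symlink()/zfs_sa_readlink() helpers.  Condensed sketch of the
 * store side, ZFS kernel context assumed; note that a symlink's file size
 * is simply the length of its target.
 */
static void
symlink_store_sketch(znode_t *zp, zfsvfs_t *zfsvfs, char *link, uint64_t len,
    dmu_tx_t *tx)
{
        mutex_enter(&zp->z_lock);
        if (zp->z_is_sa)
                (void) sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
                    link, len, tx);
        else
                zfs_sa_symlink(zp, link, len, tx);      /* pre-SA layout */
        mutex_exit(&zp->z_lock);

        zp->z_size = len;
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
            &zp->z_size, sizeof (zp->z_size), tx);
}
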
*/ - if ((szp->z_phys->zp_flags & ZFS_XATTR) != - (dzp->z_phys->zp_flags & ZFS_XATTR)) { + if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } - owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && - secpolicy_basic_link(svp, cr) != 0) { + owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -3987,8 +4249,10 @@ top: } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); @@ -4019,52 +4283,721 @@ top: vnevent_link(svp, ct); } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); } -/*ARGSUSED*/ -void -zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +#ifdef sun +/* + * zfs_null_putapage() is used when the file system has been force + * unmounted. It just drops the pages. + */ +/* ARGSUSED */ +static int +zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, + size_t *lenp, int flags, cred_t *cr) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_dbuf == NULL) { - /* - * The fs has been unmounted, or we did a - * suspend/resume and this file no longer exists. - */ - VI_LOCK(vp); - vp->v_count = 0; /* count arrives as 1 */ - VI_UNLOCK(vp); - vrecycle(vp, curthread); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } + pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); + return (0); +} + +/* + * Push a page out to disk, klustering if possible. + * + * IN: vp - file to push page to. + * pp - page to push. + * flags - additional flags. + * cr - credentials of caller. + * + * OUT: offp - start of range pushed. + * lenp - len of range pushed. + * + * RETURN: 0 if success + * error code if failure + * + * NOTE: callers must have locked the page to be pushed. On + * exit, the page (and all other pages in the kluster) must be + * unlocked. + */ +/* ARGSUSED */ +static int +zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, + size_t *lenp, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + u_offset_t off, koff; + size_t len, klen; + int err; + + off = pp->p_offset; + len = PAGESIZE; + /* + * If our blocksize is bigger than the page size, try to kluster + * multiple pages so that we write a full block (thus avoiding + * a read-modify-write). + */ + if (off < zp->z_size && zp->z_blksz > PAGESIZE) { + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = ISP2(klen) ? 
P2ALIGN(off, (u_offset_t)klen) : 0; + ASSERT(koff <= zp->z_size); + if (koff + klen > zp->z_size) + klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); + pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); + } + ASSERT3U(btop(len), ==, btopr(len)); + + /* + * Can't push pages past end-of-file. + */ + if (off >= zp->z_size) { + /* ignore all pages */ + err = 0; + goto out; + } else if (off + len > zp->z_size) { + int npages = btopr(zp->z_size - off); + page_t *trunc; + + page_list_break(&pp, &trunc, npages); + /* ignore pages past end of file */ + if (trunc) + pvn_write_done(trunc, flags); + len = zp->z_size - off; + } + + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { + err = EDQUOT; + goto out; + } +top: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_NOWAIT); + if (err != 0) { + if (err == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz <= PAGESIZE) { + caddr_t va = zfs_map_page(pp, S_READ); + ASSERT3U(len, <=, PAGESIZE); + dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); + zfs_unmap_page(pp, va); + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); + } + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); + } + dmu_tx_commit(tx); + +out: + pvn_write_done(pp, (err ? B_ERROR : 0) | flags); + if (offp) + *offp = off; + if (lenp) + *lenp = len; + + return (err); +} + +/* + * Copy the portion of the file indicated from pages into the file. + * The pages are stored in a page list attached to the files vnode. + * + * IN: vp - vnode of file to push page data to. + * off - position in file to put data. + * len - amount of data to write. + * flags - flags to control the operation. + * cr - credentials of caller. + * ct - caller context. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + page_t *pp; + size_t io_len; + u_offset_t io_off; + uint_t blksz; + rl_t *rl; + int error = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * Align this request to the file block size in case we kluster. + * XXX - this can result in pretty aggresive locking, which can + * impact simultanious read/write access. One option might be + * to break up long requests (len == 0) into block-by-block + * operations to get narrower locking. + */ + blksz = zp->z_blksz; + if (ISP2(blksz)) + io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); + else + io_off = 0; + if (len > 0 && ISP2(blksz)) + io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); + else + io_len = 0; + + if (io_len == 0) { + /* + * Search the entire vp list for pages >= io_off. 
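
/*
 * Self-contained illustration of the block alignment zfs_putpage() performs
 * above before klustering: the start of the request is rounded down to a
 * block boundary and the length rounded up, so a full block can be written
 * without a read-modify-write.  The P2ALIGN_TYPED()/P2ROUNDUP_TYPED()
 * macros used above behave like the local P2ALIGN/P2ROUNDUP below for
 * matching types; the 128 KB block size is an assumption of the example.
 */
#include <stdint.h>
#include <stdio.h>

#define P2ALIGN(x, align)       ((x) & -(align))
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        uint64_t blksz = 131072;                /* assumed 128 KB recordsize */
        uint64_t off = 200000, len = 4096;      /* one dirty page */
        uint64_t io_off, io_len;

        io_off = P2ALIGN(off, blksz);
        io_len = P2ROUNDUP(len + (off - io_off), blksz);
        printf("request   [%ju, +%ju)\n", (uintmax_t)off, (uintmax_t)len);
        printf("klustered [%ju, +%ju)\n", (uintmax_t)io_off, (uintmax_t)io_len);
        return (0);
}
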
+ */ + rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); + error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); + goto out; + } + rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); + + if (off > zp->z_size) { + /* past end of file */ + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (0); + } + + len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); + + for (off = io_off; io_off < off + len; io_off += io_len) { + if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { + pp = page_lookup(vp, io_off, + (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); + } else { + pp = page_lookup_nowait(vp, io_off, + (flags & B_FREE) ? SE_EXCL : SE_SHARED); + } + + if (pp != NULL && pvn_getdirty(pp, flags)) { + int err; + + /* + * Found a dirty page to push + */ + err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); + if (err) + error = err; + } else { + io_len = PAGESIZE; + } + } +out: + zfs_range_unlock(rl); + if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* sun */ + +/*ARGSUSED*/ +void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + VI_LOCK(vp); + ASSERT(vp->v_count <= 1); + vp->v_count = 0; + VI_UNLOCK(vp); + vrecycle(vp, curthread); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); } +#ifdef sun +/* + * Bounds-check the seek operation. + * + * IN: vp - vnode seeking within + * ooff - old file offset + * noffp - pointer to new file offset + * ct - caller context + * + * RETURN: 0 if success + * EINVAL if new offset invalid + */ +/* ARGSUSED */ +static int +zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (0); + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); +} + +/* + * Pre-filter the generic locking function to trap attempts to place + * a mandatory lock on a memory mapped file. + */ +static int +zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, + flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * We are following the UFS semantics with respect to mapcnt + * here: If we see that the file is mapped already, then we will + * return an error, but we don't worry about races between this + * function and zfs_map(). + */ + if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); + } + ZFS_EXIT(zfsvfs); + return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); +} + +/* + * If we can't find a page in the cache, we will create a new page + * and fill it with file data. 
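
/*
 * zfs_inactive() above flushes a deferred atime update when the last
 * reference to a vnode goes away: if only the access time is dirty, the new
 * value is pushed through the SA handle in its own transaction, assigned
 * with TXG_WAIT (no NOWAIT/retry loop is needed here).  Minimal sketch, ZFS
 * kernel context assumed:
 */
static void
atime_flush_sketch(znode_t *zp, zfsvfs_t *zfsvfs)
{
        dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
        int error;

        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        zfs_sa_upgrade_txholds(tx, zp);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return;
        }
        mutex_enter(&zp->z_lock);
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
            (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
        zp->z_atime_dirty = 0;
        mutex_exit(&zp->z_lock);
        dmu_tx_commit(tx);
}
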
For efficiency, we may try to fill + * multiple pages at once (klustering) to fill up the supplied page + * list. Note that the pages to be filled are held with an exclusive + * lock to prevent access by other threads while they are being filled. + */ +static int +zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, + caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) +{ + znode_t *zp = VTOZ(vp); + page_t *pp, *cur_pp; + objset_t *os = zp->z_zfsvfs->z_os; + u_offset_t io_off, total; + size_t io_len; + int err; + + if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + /* + * We only have a single page, don't bother klustering + */ + io_off = off; + io_len = PAGESIZE; + pp = page_create_va(vp, io_off, io_len, + PG_EXCL | PG_WAIT, seg, addr); + } else { + /* + * Try to find enough pages to fill the page list + */ + pp = pvn_read_kluster(vp, off, seg, addr, &io_off, + &io_len, off, plsz, 0); + } + if (pp == NULL) { + /* + * The page already exists, nothing to do here. + */ + *pl = NULL; + return (0); + } + + /* + * Fill the pages in the kluster. + */ + cur_pp = pp; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + + ASSERT3U(io_off, ==, cur_pp->p_offset); + va = zfs_map_page(cur_pp, S_WRITE); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); + zfs_unmap_page(cur_pp, va); + if (err) { + /* On error, toss the entire kluster */ + pvn_read_done(pp, B_ERROR); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; + return (err); + } + cur_pp = cur_pp->p_next; + } + + /* + * Fill in the page list array from the kluster starting + * from the desired offset `off'. + * NOTE: the page list will always be null terminated. + */ + pvn_plist_init(pp, pl, plsz, off, io_len, rw); + ASSERT(pl == NULL || (*pl)->p_offset == off); + + return (0); +} + +/* + * Return pointers to the pages for the file region [off, off + len] + * in the pl array. If plsz is greater than len, this function may + * also return page pointers from after the specified region + * (i.e. the region [off, off + plsz]). These additional pages are + * only returned if they are already in the cache, or were created as + * part of a klustered read. + * + * IN: vp - vnode of file to get data from. + * off - position in file to get data from. + * len - amount of data to retrieve. + * plsz - length of provided page list. + * seg - segment to obtain pages for. + * addr - virtual address of fault. + * rw - mode of created pages. + * cr - credentials of caller. + * ct - caller context. + * + * OUT: protp - protection mode of created pages. + * pl - list of pages created. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, + enum seg_rw rw, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + page_t **pl0 = pl; + int err = 0; + + /* we do our own caching, faultahead is unnecessary */ + if (pl == NULL) + return (0); + else if (len > plsz) + len = plsz; + else + len = P2ROUNDUP(len, PAGESIZE); + ASSERT(plsz >= len); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (protp) + *protp = PROT_ALL; + + /* + * Loop through the requested range [off, off + len) looking + * for pages. If we don't find a page, we will need to create + * a new page and fill it with data from the file. 
+ */ + while (len > 0) { + if (*pl = page_lookup(vp, off, SE_SHARED)) + *(pl+1) = NULL; + else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) + goto out; + while (*pl) { + ASSERT3U((*pl)->p_offset, ==, off); + off += PAGESIZE; + addr += PAGESIZE; + if (len > 0) { + ASSERT3U(len, >=, PAGESIZE); + len -= PAGESIZE; + } + ASSERT3U(plsz, >=, PAGESIZE); + plsz -= PAGESIZE; + pl++; + } + } + + /* + * Fill out the page array with any pages already in the cache. + */ + while (plsz > 0 && + (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { + off += PAGESIZE; + plsz -= PAGESIZE; + } +out: + if (err) { + /* + * Release any pages we have previously locked. + */ + while (pl > pl0) + page_unlock(*--pl); + } else { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + } + + *pl = NULL; + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Request a memory map for a section of a file. This code interacts + * with common code and the VM system as follows: + * + * common code calls mmap(), which ends up in smmap_common() + * + * this calls VOP_MAP(), which takes you into (say) zfs + * + * zfs_map() calls as_map(), passing segvn_create() as the callback + * + * segvn_create() creates the new segment and calls VOP_ADDMAP() + * + * zfs_addmap() updates z_mapcnt + */ +/*ARGSUSED*/ +static int +zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + segvn_crargs_t vn_a; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((prot & PROT_WRITE) && (zp->z_pflags & + (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((prot & (PROT_READ | PROT_EXEC)) && + (zp->z_pflags & ZFS_AV_QUARANTINED)) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } + + if (vp->v_flag & VNOMAP) { + ZFS_EXIT(zfsvfs); + return (ENOSYS); + } + + if (off < 0 || len > MAXOFFSET_T - off) { + ZFS_EXIT(zfsvfs); + return (ENXIO); + } + + if (vp->v_type != VREG) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + + /* + * If file is locked, disallow mapping. + */ + if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); + } + + as_rangelock(as); + error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (error != 0) { + as_rangeunlock(as); + ZFS_EXIT(zfsvfs); + return (error); + } + + vn_a.vp = vp; + vn_a.offset = (u_offset_t)off; + vn_a.type = flags & MAP_TYPE; + vn_a.prot = prot; + vn_a.maxprot = maxprot; + vn_a.cred = cr; + vn_a.amp = NULL; + vn_a.flags = flags & ~MAP_TYPE; + vn_a.szc = 0; + vn_a.lgrp_mem_policy_flags = 0; + + error = as_map(as, *addrp, len, segvn_create, &vn_a); + + as_rangeunlock(as); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +static int +zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + uint64_t pages = btopr(len); + + atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); + return (0); +} + +/* + * The reason we push dirty pages as part of zfs_delmap() is so that we get a + * more accurate mtime for the associated file. Since we don't have a way of + * detecting when the data was actually modified, we have to resort to + * heuristics. If an explicit msync() is done, then we mark the mtime when the + * last page is pushed. 
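
/*
 * Self-contained illustration of the mapping bookkeeping done by
 * zfs_addmap() above: mapped regions are counted in pages, rounding the
 * byte length up the way btopr() does, which lets zfs_frlock() refuse
 * mandatory locks while z_mapcnt is non-zero.  The 4 KB page size is an
 * assumption of the example.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ         4096ULL

static uint64_t
pages_spanned(uint64_t len)
{
        return ((len + PAGE_SZ - 1) / PAGE_SZ);         /* like btopr(len) */
}

int
main(void)
{
        uint64_t z_mapcnt = 0;

        z_mapcnt += pages_spanned(10000);       /* addmap of 10000 bytes */
        printf("pages mapped: %ju\n", (uintmax_t)z_mapcnt);
        z_mapcnt -= pages_spanned(10000);       /* matching unmap */
        printf("pages mapped after unmap: %ju\n", (uintmax_t)z_mapcnt);
        return (0);
}
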
The problem occurs when the msync() call is omitted, + * which by far the most common case: + * + * open() + * mmap() + * + * munmap() + * close() + *