diff options
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 863 |
1 files changed, 662 insertions, 201 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 287de4c..e9a956c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -45,6 +46,7 @@ #include <sys/dsl_deleg.h> #include <sys/spa.h> #include <sys/zap.h> +#include <sys/sa.h> #include <sys/varargs.h> #include <sys/policy.h> #include <sys/atomic.h> @@ -55,17 +57,19 @@ #include <sys/dnlc.h> #include <sys/dmu_objset.h> #include <sys/spa_boot.h> +#include <sys/sa.h> +#include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); -int zfs_super_owner = 0; +int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); -int zfs_debug_level = 0; +int zfs_debug_level; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); @@ -74,12 +78,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); -static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD, - &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION"); -static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD, - &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); @@ -156,9 +154,8 @@ zfs_sync(vfs_t *vfsp, int waitfor) } if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); - else - txg_wait_synced(dp, 0); + zil_commit(zfsvfs->z_log, 0); + ZFS_EXIT(zfsvfs); } else { /* @@ -172,6 +169,60 @@ zfs_sync(vfs_t *vfsp, int waitfor) return (0); } +#ifndef __FreeBSD__ +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. + */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} +#endif /* !__FreeBSD__ */ + static void atime_changed_cb(void *arg, uint64_t newval) { @@ -313,14 +364,6 @@ vscan_changed_cb(void *arg, uint64_t newval) } static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; @@ -335,11 +378,11 @@ zfs_register_callbacks(vfs_t *vfsp) objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; - int readonly, do_readonly = FALSE; - int setuid, do_setuid = FALSE; - int exec, do_exec = FALSE; - int xattr, do_xattr = FALSE; - int atime, do_atime = FALSE; + int readonly, do_readonly = B_FALSE; + int setuid, do_setuid = B_FALSE; + int exec, do_exec = B_FALSE; + int xattr, do_xattr = B_FALSE; + int atime, do_atime = B_FALSE; int error = 0; ASSERT(vfsp); @@ -360,7 +403,8 @@ zfs_register_callbacks(vfs_t *vfsp) * of mount options, we stash away the current values and * restore them after we register the callbacks. */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { @@ -444,8 +488,6 @@ zfs_register_callbacks(vfs_t *vfsp) error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -483,7 +525,6 @@ unregister: (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -491,62 +532,53 @@ unregister: } -static void -uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, - int64_t delta, dmu_tx_t *tx) -{ - uint64_t used = 0; - char buf[32]; - int err; - uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - - if (delta == 0) - return; - - (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); - err = zap_lookup(os, obj, buf, 8, 1, &used); - ASSERT(err == 0 || err == ENOENT); - /* no underflow/overflow */ - ASSERT(delta > 0 || used >= -delta); - ASSERT(delta < 0 || used + delta > used); - used += delta; - if (used == 0) - err = zap_remove(os, obj, buf, tx); - else - err = zap_update(os, obj, buf, 8, 1, &used, tx); - ASSERT(err == 0); -} - -static void -zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype, - void *oldbonus, void *newbonus, - uint64_t oldused, uint64_t newused, dmu_tx_t *tx) +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) { - znode_phys_t *oldznp = oldbonus; - znode_phys_t *newznp = newbonus; + znode_phys_t *znp = data; + int error = 0; - if (bonustype != DMU_OT_ZNODE) - return; + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (ENOENT); - /* We charge 512 for the dnode (if it's allocated). */ - if (oldznp->zp_gen != 0) - oldused += DNODE_SIZE; - if (newznp->zp_gen != 0) - newused += DNODE_SIZE; + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (EEXIST); - if (oldznp->zp_uid == newznp->zp_uid) { - uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx); + if (bonustype == DMU_OT_ZNODE) { + *userp = znp->zp_uid; + *groupp = znp->zp_gid; } else { - uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx); - uidacct(os, B_FALSE, newznp->zp_uid, newused, tx); - } + int hdrsize; - if (oldznp->zp_gid == newznp->zp_gid) { - uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx); - } else { - uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx); - uidacct(os, B_TRUE, newznp->zp_gid, newused, tx); + ASSERT(bonustype == DMU_OT_SA); + hdrsize = sa_hdrsize(data); + + if (hdrsize != 0) { + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + } else { + /* + * This should only happen for newly created + * files that haven't had the znode data filled + * in yet. + */ + *userp = 0; + *groupp = 0; + } } + return (error); } static void @@ -733,7 +765,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, } boolean_t -zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; @@ -756,33 +788,48 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) return (used >= quota); } +boolean_t +zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) +{ + uint64_t fuid; + uint64_t quotaobj; + + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + fuid = isgroup ? zp->z_gid : zp->z_uid; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); +} + int -zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; uint64_t zval; int i, error; + uint64_t sa_obj; - if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL)) - return (error); - if (zval) - mode |= DS_MODE_READONLY; + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); - if (error == EROFS) { - mode |= DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); - } - if (error) + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); + } /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, * and just bzero up to z_hold_mtx[]. */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; @@ -792,15 +839,15 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error) { goto out; - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + } else if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); error = ENOTSUP; goto out; } - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) goto out; zfsvfs->z_norm = (int)zval; @@ -822,6 +869,29 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error) + return (error); + } else { + /* + * Pre SA versions file systems should never touch + * either the attribute registration or layout objects. + */ + sa_obj = 0; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error) + goto out; + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); @@ -857,7 +927,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) goto out; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); @@ -867,12 +936,12 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - *zvp = zfsvfs; + *zfvp = zfsvfs; return (0); out: - dmu_objset_close(os); - *zvp = NULL; + dmu_objset_disown(os, zfsvfs); + *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } @@ -889,15 +958,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) /* * Set the objset user_ptr to track its zfsvfs. */ - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - if (zil_disable) { - zil_destroy(zfsvfs->z_log, B_FALSE); - zfsvfs->z_log = NULL; - } /* * If we are not mounting (ie: online recv), then we don't @@ -917,37 +982,42 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) else zfs_unlinked_drain(zfsvfs); - if (zfsvfs->z_log) { - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest - * doesn't use readonly mounts, where - * zfs_unlinked_drain() isn't called.) This is because - * ziltest causes spa_sync() to think it's committed, - * but actually it is not, so the intent log contains - * many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated - * in a yet later txg. This would write a "create - * object N" record to the intent log. Normally, this - * would be fine because the spa_sync() would have - * written out the fact that object N is free, before - * we could write the "create object N" intent log - * record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } @@ -974,7 +1044,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs) zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); @@ -989,13 +1058,24 @@ static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { - vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int @@ -1009,7 +1089,7 @@ zfs_domount(vfs_t *vfsp, char *osname) ASSERT(vfsp); ASSERT(osname); - error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs); + error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; @@ -1026,7 +1106,6 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; - /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a @@ -1053,6 +1132,7 @@ zfs_domount(vfs_t *vfsp, char *osname) vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; @@ -1063,10 +1143,11 @@ zfs_domount(vfs_t *vfsp, char *osname) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } @@ -1080,7 +1161,7 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); @@ -1121,9 +1202,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); @@ -1132,6 +1210,302 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) } } +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (EINVAL); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". + */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (EINVAL); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * zfs_check_global_label: + * Check that the hex label string is appropriate for the dataset + * being mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (EACCES); + return (rdonly ? 0 : EACCES); + } + return (EACCES); +} + +/* + * zfs_mount_label_policy: + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns: + * 0 : access allowed + * >0 : error code, such as EACCES + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EACCES); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (EACCES); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EACCES); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +#ifdef OPENSOLARIS_MOUNTROOT +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + char *zfs_bootfs; + char *zfs_devid; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in the + * boot property "zfs-bootfs" with a format of + * "poolname/root-dataset-objnum". + */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (EBUSY); + /* + * the process of doing a spa_load will require the + * clock to be set before we could (for example) do + * something better by looking at the timestamp on + * an uberblock, so just set it to -1. + */ + clkset(-1); + + if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { + cmn_err(CE_NOTE, "spa_get_bootfs: can not get " + "bootfs name"); + return (EINVAL); + } + zfs_devid = spa_get_bootprop("diskdevid"); + error = spa_import_rootpool(rootfs.bo_name, zfs_devid); + if (zfs_devid) + spa_free_bootprop(zfs_devid); + if (error) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "spa_import_rootpool: error %d", + error); + return (error); + } + if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", + error); + return (error); + } + + spa_free_bootprop(zfs_bootfs); + + if (error = vfs_lock(vfsp)) + return (error); + + if (error = zfs_domount(vfsp, rootfs.bo_name)) { + cmn_err(CE_NOTE, "zfs_domount: error %d", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { + cmn_err(CE_NOTE, "zfs_zget: error %d", error); + goto out; + } + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * Leave rootvp held. The root file system is never unmounted. + */ + + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + return (error); + } else if (why == ROOT_REMOUNT) { + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + return (zfs_register_callbacks(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is equal to anything else other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (ENOTSUP); +} +#endif /* OPENSOLARIS_MOUNTROOT */ + /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) @@ -1203,6 +1577,12 @@ zfs_mount(vfs_t *vfsp) goto out; } +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + vfsp->vfs_flag |= MNT_NFS4ACLS; /* @@ -1291,6 +1671,25 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) return (0); } +int +zfs_vnode_lock(vnode_t *vp, int flags) +{ + int error; + + ASSERT(vp != NULL); + + /* + * Check if the file system wasn't forcibly unmounted in the meantime. + */ + error = vn_lock(vp, flags); + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { + VOP_UNLOCK(vp, 0); + error = ENOENT; + } + + return (error); +} + static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { @@ -1301,14 +1700,18 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) ZFS_ENTER_NOERROR(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { - *vpp = ZTOV(rootzp); - error = vn_lock(*vpp, flags); - (*vpp)->v_vflag |= VV_ROOT; + error = zfs_vnode_lock(*vpp, flags); + if (error == 0) + (*vpp)->v_vflag |= VV_ROOT; } + if (error != 0) + *vpp = NULL; return (error); } @@ -1371,7 +1774,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_dbuf) { + if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } @@ -1416,10 +1819,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) /* * Evict cached data */ - if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); - } + if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os)) + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } @@ -1440,6 +1843,7 @@ zfs_umount(vfs_t *vfsp, int fflag) ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } + /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the @@ -1525,14 +1929,14 @@ zfs_umount(vfs_t *vfsp, int fflag) /* * Unset the objset user_ptr. */ - mutex_enter(&os->os->os_user_ptr_lock); + mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); - mutex_exit(&os->os->os_user_ptr_lock); + mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ - dmu_objset_close(os); + dmu_objset_disown(os, zfsvfs); } /* @@ -1572,13 +1976,13 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) VN_RELE(ZTOV(zp)); err = EINVAL; } + if (err == 0) + *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); + if (err == 0) + err = zfs_vnode_lock(*vpp, flags); if (err != 0) *vpp = NULL; - else { - *vpp = ZTOV(zp); - vn_lock(*vpp, flags); - } return (err); } @@ -1611,7 +2015,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; - int i, err; + int i, err; *vpp = NULL; @@ -1665,8 +2069,10 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) VN_HOLD(*vpp); } ZFS_EXIT(zfsvfs); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - return (0); + err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + if (err != 0) + *vpp = NULL; + return (err); } gen_mask = -1ULL >> (64 - 8 * i); @@ -1676,7 +2082,9 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) ZFS_EXIT(zfsvfs); return (err); } - zp_gen = zp->z_phys->zp_gen & gen_mask; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { @@ -1686,12 +2094,14 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) return (EINVAL); } - ZFS_EXIT(zfsvfs); - *vpp = ZTOV(zp); - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread); - return (0); + ZFS_EXIT(zfsvfs); + err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); } /* @@ -1701,17 +2111,13 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep) +zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - - *modep = zfsvfs->z_os->os_mode; - if (name) - dmu_objset_name(zfsvfs->z_os, name); - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } @@ -1720,21 +2126,49 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep) * Reopen zfsvfs_t::z_os and release VOPs. */ int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, + &zfsvfs->z_os); if (err) { zfsvfs->z_os = NULL; } else { znode_t *zp; + uint64_t sa_obj = 0; + + /* + * Make sure version hasn't changed + */ + + err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION, + &zfsvfs->z_version); + + if (err) + goto bail; + + err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj); + + if (err && zfsvfs->z_version >= ZPL_VERSION_SA) + goto bail; + + if ((err = sa_setup(zfsvfs->z_os, sa_obj, + zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) + goto bail; + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(zfsvfs->z_os, + zfs_sa_upgrade); VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + zfs_set_fuid_feature(zfsvfs); + /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let @@ -1747,17 +2181,17 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); - } +bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* - * Since we couldn't reopen zfsvfs::z_os, force - * unmount this file system. + * Since we couldn't reopen zfsvfs::z_os, or + * setup the sa framework force unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); @@ -1773,9 +2207,11 @@ zfs_freevfs(vfs_t *vfsp) #ifdef sun /* * If this is a snapshot, we have an extra VFS_HOLD on our parent - * from zfs_mount(). Release it here. + * from zfs_mount(). Release it here. If we came through + * zfs_mountroot() instead, we didn't grab an extra hold, so + * skip the VFS_RELE for rootvfs. */ - if (zfsvfs->z_issnap) + if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif /* sun */ @@ -1825,17 +2261,17 @@ zfs_init(void) printf("ZFS filesystem version " ZPL_VERSION_STRING "\n"); /* - * Initialize znode cache, vnode ops, etc... + * Initialize .zfs directory structures */ - zfs_znode_init(); + zfsctl_init(); /* - * Initialize .zfs directory structures + * Initialize znode cache, vnode ops, etc... */ - zfsctl_init(); + zfs_znode_init(); /* - * Reduce number of vnode. Originally number of vnodes is calculated + * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ @@ -1871,13 +2307,23 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) if (newvers < zfsvfs->z_version) return (EINVAL); + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (ENOTSUP); + tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); @@ -1886,20 +2332,35 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (error); } - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT3U(error, ==, 0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, newvers, dmu_objset_id(os)); dmu_tx_commit(tx); zfsvfs->z_version = newvers; - if (zfsvfs->z_version >= ZPL_VERSION_FUID) - zfs_set_fuid_feature(zfsvfs); + zfs_set_fuid_feature(zfsvfs); return (0); } + /* * Read a property stored within the master node. */ |