diff options
52 files changed, 2536 insertions, 446 deletions
diff --git a/sys/compat/svr4/svr4_fcntl.c b/sys/compat/svr4/svr4_fcntl.c index 4040030..c65f345 100644 --- a/sys/compat/svr4/svr4_fcntl.c +++ b/sys/compat/svr4/svr4_fcntl.c @@ -247,6 +247,7 @@ fd_revoke(p, fd) struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; + struct mount *mp; struct vattr vattr; int error, *retval; @@ -271,8 +272,11 @@ fd_revoke(p, fd) (error = suser(p)) != 0) goto out; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); out: vrele(vp); return error; diff --git a/sys/conf/files b/sys/conf/files index 84130e1..01f088f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -906,6 +906,8 @@ ufs/ffs/ffs_balloc.c optional ffs ufs/ffs/ffs_balloc.c optional mfs ufs/ffs/ffs_inode.c optional ffs ufs/ffs/ffs_inode.c optional mfs +ufs/ffs/ffs_snapshot.c optional ffs +ufs/ffs/ffs_snapshot.c optional mfs ufs/ffs/ffs_softdep.c optional softupdates ufs/ffs/ffs_softdep_stub.c standard ufs/ffs/ffs_subr.c optional ffs diff --git a/sys/dev/vn/vn.c b/sys/dev/vn/vn.c index 88e3801..efbc437 100644 --- a/sys/dev/vn/vn.c +++ b/sys/dev/vn/vn.c @@ -276,7 +276,6 @@ vnstrategy(struct bio *bp) int unit; struct vn_softc *vn; int error; - int isvplocked = 0; unit = dkunit(bp->bio_dev); vn = bp->bio_dev->si_drv1; @@ -360,6 +359,7 @@ vnstrategy(struct bio *bp) */ struct uio auio; struct iovec aiov; + struct mount *mp; bzero(&auio, sizeof(auio)); @@ -375,18 +375,18 @@ vnstrategy(struct bio *bp) auio.uio_rw = UIO_WRITE; auio.uio_resid = bp->bio_bcount; auio.uio_procp = curproc; - if (!VOP_ISLOCKED(vn->sc_vp, NULL)) { - isvplocked = 1; + if (VOP_ISLOCKED(vn->sc_vp, NULL)) + vprint("unexpected vn driver lock", vn->sc_vp); + if (bp->bio_cmd == BIO_READ) { vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, curproc); - } - if(bp->bio_cmd == BIO_READ) error = VOP_READ(vn->sc_vp, &auio, 0, vn->sc_cred); - else + } else { + (void) vn_start_write(vn->sc_vp, &mp, V_WAIT); + 
vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, curproc); error = VOP_WRITE(vn->sc_vp, &auio, 0, vn->sc_cred); - if (isvplocked) { - VOP_UNLOCK(vn->sc_vp, 0, curproc); - isvplocked = 0; + vn_finished_write(mp); } + VOP_UNLOCK(vn->sc_vp, 0, curproc); bp->bio_resid = auio.uio_resid; if (error) { diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c index 772a94c..72c7cae 100644 --- a/sys/fs/fdescfs/fdesc_vnops.c +++ b/sys/fs/fdescfs/fdesc_vnops.c @@ -383,6 +383,8 @@ fdesc_setattr(ap) { struct filedesc *fdp = ap->a_p->p_fd; struct vattr *vap = ap->a_vap; + struct vnode *vp; + struct mount *mp; struct file *fp; unsigned fd; int error; @@ -403,8 +405,11 @@ fdesc_setattr(ap) switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: - error = VOP_SETATTR((struct vnode *) fp->f_data, ap->a_vap, - ap->a_cred, ap->a_p); + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred, ap->a_p); + vn_finished_write(mp); break; default: diff --git a/sys/fs/fifofs/fifo_vnops.c b/sys/fs/fifofs/fifo_vnops.c index 5bd13a7..03e3e37 100644 --- a/sys/fs/fifofs/fifo_vnops.c +++ b/sys/fs/fifofs/fifo_vnops.c @@ -107,6 +107,7 @@ static struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { { &vop_open_desc, (vop_t *) fifo_open }, { &vop_pathconf_desc, (vop_t *) fifo_pathconf }, { &vop_poll_desc, (vop_t *) fifo_poll }, + { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_print_desc, (vop_t *) fifo_print }, { &vop_read_desc, (vop_t *) fifo_read }, { &vop_readdir_desc, (vop_t *) fifo_badop }, diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c index cbe52f4..baf40c3 100644 --- a/sys/fs/specfs/spec_vnops.c +++ b/sys/fs/specfs/spec_vnops.c @@ -88,6 +88,7 @@ static struct vnodeopv_entry_desc spec_vnodeop_entries[] = { { &vop_open_desc, (vop_t *) spec_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) spec_poll }, + 
{ &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_print_desc, (vop_t *) spec_print }, { &vop_read_desc, (vop_t *) spec_read }, { &vop_readdir_desc, (vop_t *) vop_panic }, @@ -415,16 +416,29 @@ spec_strategy(ap) struct buf *bp; struct vnode *vp; struct mount *mp; + int error; bp = ap->a_bp; - if ((bp->b_iocmd == BIO_WRITE) && (LIST_FIRST(&bp->b_dep)) != NULL) - buf_start(bp); - + vp = ap->a_vp; + if ((bp->b_iocmd == BIO_WRITE)) { + if (vp->v_mount != NULL && + (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) + panic("spec_strategy: bad I/O"); + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_start(bp); + if ((vp->v_flag & VCOPYONWRITE) && + (error = VOP_COPYONWRITE(vp, bp)) != 0 && + error != EOPNOTSUPP) { + bp->b_io.bio_error = error; + bp->b_io.bio_flags |= BIO_ERROR; + biodone(&bp->b_io); + return (0); + } + } /* * Collect statistics on synchronous and asynchronous read * and write counts for disks that have associated filesystems. */ - vp = ap->a_vp; if (vn_isdisk(vp, NULL) && (mp = vp->v_specmountpoint) != NULL) { if (bp->b_iocmd == BIO_WRITE) { if (bp->b_lock.lk_lockholder == LK_KERNPROC) diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c index 6b88bef..d1d6e31 100644 --- a/sys/fs/unionfs/union_subr.c +++ b/sys/fs/unionfs/union_subr.c @@ -747,6 +747,7 @@ union_copyup(un, docopy, cred, p) struct proc *p; { int error; + struct mount *mp; struct vnode *lvp, *uvp; /* @@ -759,9 +760,12 @@ union_copyup(un, docopy, cred, p) if (error) return (error); - error = union_vn_create(&uvp, un, p); - if (error) + if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0) return (error); + if ((error = union_vn_create(&uvp, un, p)) != 0) { + vn_finished_write(mp); + return (error); + } lvp = un->un_lowervp; @@ -785,6 +789,7 @@ union_copyup(un, docopy, cred, p) } VOP_UNLOCK(uvp, 0, p); + vn_finished_write(mp); union_newupper(un, uvp); KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); union_vn_close(uvp, 
FWRITE, cred, p); @@ -910,11 +915,15 @@ union_mkshadow(um, dvp, cnp, vpp) struct vattr va; struct proc *p = cnp->cn_proc; struct componentname cn; + struct mount *mp; - error = union_relookup(um, dvp, vpp, cnp, &cn, - cnp->cn_nameptr, cnp->cn_namelen); - if (error) + if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + if ((error = union_relookup(um, dvp, vpp, cnp, &cn, + cnp->cn_nameptr, cnp->cn_namelen)) != 0) { + vn_finished_write(mp); return (error); + } if (*vpp) { if (cn.cn_flags & HASBUF) { @@ -925,6 +934,7 @@ union_mkshadow(um, dvp, cnp, vpp) vrele(*vpp); else vput(*vpp); + vn_finished_write(mp); *vpp = NULLVP; return (EEXIST); } @@ -950,6 +960,7 @@ union_mkshadow(um, dvp, cnp, vpp) cn.cn_flags &= ~HASBUF; } /*vput(dvp);*/ + vn_finished_write(mp); return (error); } @@ -973,10 +984,15 @@ union_mkwhiteout(um, dvp, cnp, path) struct proc *p = cnp->cn_proc; struct vnode *wvp; struct componentname cn; + struct mount *mp; + if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); - if (error) + if (error) { + vn_finished_write(mp); return (error); + } if (wvp) { if (cn.cn_flags & HASBUF) { @@ -987,6 +1003,7 @@ union_mkwhiteout(um, dvp, cnp, path) vrele(wvp); else vput(wvp); + vn_finished_write(mp); return (EEXIST); } @@ -998,6 +1015,7 @@ union_mkwhiteout(um, dvp, cnp, path) zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } + vn_finished_write(mp); return (error); } diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index 1c5ed5d..d7b95f3 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -93,6 +93,7 @@ static int union_print __P((struct vop_print_args *ap)); static int union_read __P((struct vop_read_args *ap)); static int union_readdir __P((struct vop_readdir_args *ap)); static int union_readlink __P((struct vop_readlink_args *ap)); +static int union_getwritemount __P((struct 
vop_getwritemount_args *ap)); static int union_reclaim __P((struct vop_reclaim_args *ap)); static int union_remove __P((struct vop_remove_args *ap)); static int union_rename __P((struct vop_rename_args *ap)); @@ -1681,6 +1682,20 @@ union_readlink(ap) return (error); } +static int +union_getwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + struct vnode *vp = UPPERVP(ap->a_vp); + + if (vp == NULL) + panic("union: missing upper layer in getwritemount"); + return(VOP_GETWRITEMOUNT(vp, ap->a_mpp)); +} + /* * union_inactive: * @@ -1963,6 +1978,7 @@ static struct vnodeopv_entry_desc union_vnodeop_entries[] = { { &vop_read_desc, (vop_t *) union_read }, { &vop_readdir_desc, (vop_t *) union_readdir }, { &vop_readlink_desc, (vop_t *) union_readlink }, + { &vop_getwritemount_desc, (vop_t *) union_getwritemount }, { &vop_reclaim_desc, (vop_t *) union_reclaim }, { &vop_remove_desc, (vop_t *) union_remove }, { &vop_rename_desc, (vop_t *) union_rename }, diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c index 9056340..ab4ac52 100644 --- a/sys/gnu/ext2fs/ext2_bmap.c +++ b/sys/gnu/ext2fs/ext2_bmap.c @@ -47,6 +47,7 @@ #include <sys/vnode.h> #include <sys/mount.h> #include <sys/resourcevar.h> +#include <sys/stat.h> #include <ufs/ufs/extattr.h> #include <ufs/ufs/quota.h> @@ -115,7 +116,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct indir a[NIADDR+1], *xap; ufs_daddr_t daddr; long metalbn; - int error, maxrun, num; + int error, num, maxrun = 0; ip = VTOI(vp); mp = vp->v_mount; @@ -127,6 +128,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) #endif if (runp) { + maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; *runp = 0; } @@ -134,7 +136,6 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) *runb = 0; } - maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; xap = ap == NULL ? 
a : ap; if (!nump) @@ -146,9 +147,12 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); - if (*bnp == 0) - *bnp = -1; - else if (runp) { + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); @@ -226,8 +230,13 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) if (bp) bqrelse(bp); - daddr = blkptrtodb(ump, daddr); - *bnp = daddr == 0 ? -1 : daddr; + *bnp = blkptrtodb(ump, daddr); + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } return (0); } diff --git a/sys/gnu/ext2fs/inode.h b/sys/gnu/ext2fs/inode.h index 83960b0..6417a10 100644 --- a/sys/gnu/ext2fs/inode.h +++ b/sys/gnu/ext2fs/inode.h @@ -84,6 +84,7 @@ struct inode { struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ u_quad_t i_modrev; /* Revision level for NFS lease. */ struct lockf *i_lockf;/* Head of byte-level lock list. */ + struct inode *i_copyonwrite; /* copy-on-write list */ /* * Side effects; used during directory lookup. 
*/ diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c index 9056340..ab4ac52 100644 --- a/sys/gnu/fs/ext2fs/ext2_bmap.c +++ b/sys/gnu/fs/ext2fs/ext2_bmap.c @@ -47,6 +47,7 @@ #include <sys/vnode.h> #include <sys/mount.h> #include <sys/resourcevar.h> +#include <sys/stat.h> #include <ufs/ufs/extattr.h> #include <ufs/ufs/quota.h> @@ -115,7 +116,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct indir a[NIADDR+1], *xap; ufs_daddr_t daddr; long metalbn; - int error, maxrun, num; + int error, num, maxrun = 0; ip = VTOI(vp); mp = vp->v_mount; @@ -127,6 +128,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) #endif if (runp) { + maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; *runp = 0; } @@ -134,7 +136,6 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) *runb = 0; } - maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; xap = ap == NULL ? a : ap; if (!nump) @@ -146,9 +147,12 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); - if (*bnp == 0) - *bnp = -1; - else if (runp) { + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); @@ -226,8 +230,13 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) if (bp) bqrelse(bp); - daddr = blkptrtodb(ump, daddr); - *bnp = daddr == 0 ? -1 : daddr; + *bnp = blkptrtodb(ump, daddr); + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } return (0); } diff --git a/sys/gnu/fs/ext2fs/inode.h b/sys/gnu/fs/ext2fs/inode.h index 83960b0..6417a10 100644 --- a/sys/gnu/fs/ext2fs/inode.h +++ b/sys/gnu/fs/ext2fs/inode.h @@ -84,6 +84,7 @@ struct inode { struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ u_quad_t i_modrev; /* Revision level for NFS lease. 
*/ struct lockf *i_lockf;/* Head of byte-level lock list. */ + struct inode *i_copyonwrite; /* copy-on-write list */ /* * Side effects; used during directory lookup. */ diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c index d914fc2..b0530f9 100644 --- a/sys/kern/kern_ktrace.c +++ b/sys/kern/kern_ktrace.c @@ -457,7 +457,8 @@ ktrwrite(vp, kth, uio) { struct uio auio; struct iovec aiov[2]; - register struct proc *p = curproc; /* XXX */ + struct proc *p = curproc; /* XXX */ + struct mount *mp; int error; if (vp == NULL) @@ -479,6 +480,7 @@ ktrwrite(vp, kth, uio) if (uio != NULL) kth->ktr_len += uio->uio_resid; } + vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void)VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, p->p_ucred); @@ -487,6 +489,7 @@ ktrwrite(vp, kth, uio) error = VOP_WRITE(vp, uio, IO_UNIT | IO_APPEND, p->p_ucred); } VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); if (!error) return; /* diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index e96f471..2d87b63 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1599,6 +1599,7 @@ coredump(p) struct nameidata nd; struct vattr vattr; int error, error1, flags; + struct mount *mp; char *name; /* name of corefile */ off_t limit; @@ -1619,6 +1620,7 @@ coredump(p) if (limit == 0) return 0; +restart: name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, p); flags = O_CREAT | FWRITE | O_NOFOLLOW; @@ -1628,6 +1630,14 @@ coredump(p) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + VOP_UNLOCK(vp, 0, p); + if ((error = vn_close(vp, FWRITE, cred, p)) != 0) + return (error); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } /* Don't dump to non-regular files or files with links. 
*/ if (vp->v_type != VREG || @@ -1647,6 +1657,7 @@ coredump(p) out: VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); error1 = vn_close(vp, FWRITE, cred, p); if (error == 0) error = error1; diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c index 2d15c70..66f7a7b 100644 --- a/sys/kern/tty_tty.c +++ b/sys/kern/tty_tty.c @@ -133,13 +133,19 @@ cttywrite(dev, uio, flag) { struct proc *p = uio->uio_procp; struct vnode *ttyvp = cttyvp(uio->uio_procp); + struct mount *mp; int error; if (ttyvp == NULL) return (EIO); + mp = NULL; + if (ttyvp->v_type != VCHR && + (error = vn_start_write(ttyvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_WRITE(ttyvp, uio, flag, NOCRED); VOP_UNLOCK(ttyvp, 0, p); + vn_finished_write(mp); return (error); } diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 0103877..a0b4072 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -538,7 +538,8 @@ unp_bind(unp, nam, p) struct proc *p; { struct sockaddr_un *soun = (struct sockaddr_un *)nam; - register struct vnode *vp; + struct vnode *vp; + struct mount *mp; struct vattr vattr; int error, namelen; struct nameidata nd; @@ -552,6 +553,7 @@ unp_bind(unp, nam, p) return EINVAL; strncpy(buf, soun->sun_path, namelen); buf[namelen] = 0; /* null-terminate the string */ +restart: NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, buf, p); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ @@ -559,14 +561,19 @@ unp_bind(unp, nam, p) if (error) return (error); vp = nd.ni_vp; - if (vp != NULL) { + if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); - vrele(vp); - return (EADDRINUSE); + if (vp != NULL) { + vrele(vp); + return (EADDRINUSE); + } + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; @@ -582,6 +589,7 @@ 
unp_bind(unp, nam, p) unp->unp_vnode = vp; unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (0); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index dba2151..96fbd63 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1165,6 +1165,8 @@ brelse(struct buf * bp) BUF_UNLOCK(bp); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_ioflags &= ~BIO_ORDERED; + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("brelse: not dirty"); splx(s); } @@ -1225,6 +1227,8 @@ bqrelse(struct buf * bp) BUF_UNLOCK(bp); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_ioflags &= ~BIO_ORDERED; + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("bqrelse: not dirty"); splx(s); } @@ -1420,7 +1424,7 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize) int isspecial; static int flushingbufs; - if (curproc && (curproc->p_flag & P_BUFEXHAUST) == 0) + if (curproc && (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) isspecial = 0; else isspecial = 1; diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index f478aa2..00f9beb 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -500,6 +500,21 @@ vop_noislocked(ap) return (lockstatus(vp->v_vnlock, ap->a_p)); } +/* + * Return our mount point, as we will take charge of the writes. + */ +int +vop_stdgetwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + + *(ap->a_mpp) = ap->a_vp->v_mount; + return (0); +} + /* * vfs default ops * used to fill the vfs fucntion table to get reasonable default return values. 
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 6483660..0e5ec3f 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -453,6 +453,7 @@ getnewvnode(tag, mp, vops, vpp) int s, count; struct proc *p = curproc; /* XXX */ struct vnode *vp = NULL; + struct mount *vnmp; vm_object_t object; /* @@ -491,7 +492,14 @@ getnewvnode(tag, mp, vops, vpp) vp = NULL; continue; } - break; + /* + * Skip over it if its filesystem is being suspended. + */ + if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) + break; + simple_unlock(&vp->v_interlock); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; } if (vp) { vp->v_flag |= VDOOMED; @@ -504,6 +512,7 @@ getnewvnode(tag, mp, vops, vpp) } else { simple_unlock(&vp->v_interlock); } + vn_finished_write(vnmp); #ifdef INVARIANTS { @@ -515,6 +524,8 @@ getnewvnode(tag, mp, vops, vpp) if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); + if (vp->v_writecount != 0) + panic("Non-zero write count"); } #endif vp->v_flag = 0; @@ -523,7 +534,6 @@ getnewvnode(tag, mp, vops, vpp) vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; - vp->v_writecount = 0; /* XXX */ } else { simple_unlock(&vnode_free_list_slock); vp = (struct vnode *) zalloc(vnode_zone); @@ -946,6 +956,7 @@ sched_sync(void) { struct synclist *slp; struct vnode *vp; + struct mount *mp; long starttime; int s; struct proc *p = updateproc; @@ -970,10 +981,12 @@ sched_sync(void) splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { - if (VOP_ISLOCKED(vp, NULL) == 0) { + if (VOP_ISLOCKED(vp, NULL) == 0 && + vn_start_write(vp, &mp, V_NOWAIT) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); } s = splbio(); if (LIST_FIRST(slp) == vp) { @@ -1386,6 +1399,7 @@ vrele(vp) struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); + KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close")); simple_lock(&vp->v_interlock); @@ 
-1427,6 +1441,7 @@ vput(vp) struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); + KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); simple_lock(&vp->v_interlock); @@ -1632,6 +1647,8 @@ vclean(vp, flags, p) * If the flush fails, just toss the buffers. */ if (flags & DOCLOSE) { + if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) + (void) vn_write_suspend_wait(vp, V_WAIT); if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0) vinvalbuf(vp, 0, NOCRED, p, 0, 0); } @@ -2785,12 +2802,18 @@ sync_fsync(ap) simple_unlock(&mountlist_slock); return (0); } + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { + vfs_unbusy(mp, p); + simple_unlock(&mountlist_slock); + return (0); + } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; + vn_finished_write(mp); vfs_unbusy(mp, p); return (0); } diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c index 65a297ca..404114a 100644 --- a/sys/kern/vfs_extattr.c +++ b/sys/kern/vfs_extattr.c @@ -164,8 +164,8 @@ mount(p, uap) vput(vp); return (EOPNOTSUPP); /* Needs translation */ } - mp->mnt_flag |= - SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + mp->mnt_flag |= SCARG(uap, flags) & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); /* * Only root, or the user that did the original mount is * permitted to update it. 
@@ -303,7 +303,8 @@ update: vrele(vp); if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; - mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) { mp->mnt_flag = flag; @@ -458,7 +459,7 @@ unmount(p, uap) */ int dounmount(mp, flags, p) - register struct mount *mp; + struct mount *mp; int flags; struct proc *p; { @@ -469,6 +470,7 @@ dounmount(mp, flags, p) simple_lock(&mountlist_slock); mp->mnt_kern_flag |= MNTK_UNMOUNT; lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); @@ -481,8 +483,10 @@ dounmount(mp, flags, p) vrele(mp->mnt_syncer); if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || - (flags & MNT_FORCE)) + (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, p); + } + vn_finished_write(mp); simple_lock(&mountlist_slock); if (error) { if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) @@ -530,7 +534,7 @@ sync(p, uap) struct proc *p; struct sync_args *uap; { - register struct mount *mp, *nmp; + struct mount *mp, *nmp; int asyncflag; simple_lock(&mountlist_slock); @@ -539,13 +543,15 @@ sync(p, uap) nmp = TAILQ_NEXT(mp, mnt_list); continue; } - if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, - ((p != NULL) ? p->p_ucred : NOCRED), p); + ((p != NULL) ? 
p->p_ucred : NOCRED), p); mp->mnt_flag |= asyncflag; + vn_finished_write(mp); } simple_lock(&mountlist_slock); nmp = TAILQ_NEXT(mp, mnt_list); @@ -593,7 +599,7 @@ quotactl(p, uap) syscallarg(caddr_t) arg; } */ *uap; { - register struct mount *mp; + struct mount *mp; int error; struct nameidata nd; @@ -602,11 +608,15 @@ quotactl(p, uap) NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); - mp = nd.ni_vp->v_mount; NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); vrele(nd.ni_vp); - return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), - SCARG(uap, arg), p)); + if (error) + return (error); + error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p); + vn_finished_write(mp); + return (error); } /* @@ -972,6 +982,7 @@ open(p, uap) struct file *fp; struct vnode *vp; struct vattr vat; + struct mount *mp; int cmode, flags, oflags; struct file *nfp; int type, indx, error; @@ -1029,12 +1040,15 @@ open(p, uap) fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto bad; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETATTR(vp, &vat, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); if (error) goto bad; } @@ -1101,7 +1115,8 @@ mknod(p, uap) syscallarg(int) dev; } */ *uap; { - register struct vnode *vp; + struct vnode *vp; + struct mount *mp; struct vattr vattr; int error; int whiteout = 0; @@ -1118,14 +1133,16 @@ mknod(p, uap) } if (error) return (error); +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; - if (vp != NULL) + if (vp != NULL) { + vrele(vp); error = EEXIST; - else { + } else { VATTR_NULL(&vattr); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; vattr.va_rdev = 
SCARG(uap, dev); @@ -1149,6 +1166,13 @@ mknod(p, uap) break; } } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } if (!error) { VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); if (whiteout) @@ -1159,17 +1183,10 @@ mknod(p, uap) if (error == 0) vput(nd.ni_vp); } - NDFREE(&nd, NDF_ONLY_PNBUF); - vput(nd.ni_dvp); - } else { - NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (vp) - vrele(vp); } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); @@ -1193,23 +1210,29 @@ mkfifo(p, uap) syscallarg(int) mode; } */ *uap; { + struct mount *mp; struct vattr vattr; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); vrele(nd.ni_vp); + vput(nd.ni_dvp); return (EEXIST); } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; @@ -1219,6 +1242,7 @@ mkfifo(p, uap) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); + vn_finished_write(mp); return (error); } @@ -1240,7 +1264,8 @@ link(p, uap) syscallarg(char *) link; } */ *uap; { - register struct vnode *vp; + struct vnode *vp; + struct mount *mp; struct nameidata nd; int error; @@ -1250,30 +1275,29 @@ link(p, uap) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; - if 
(vp->v_type == VDIR) - error = EPERM; /* POSIX */ - else { - NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); - error = namei(&nd); - if (!error) { - if (nd.ni_vp != NULL) { - if (nd.ni_vp) - vrele(nd.ni_vp); - error = EEXIST; - } else { - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, - LEASE_WRITE); - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); - } - NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); } vrele(vp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); @@ -1297,6 +1321,7 @@ symlink(p, uap) syscallarg(char *) link; } */ *uap; { + struct mount *mp; struct vattr vattr; char *path; int error; @@ -1305,20 +1330,25 @@ symlink(p, uap) path = zalloc(namei_zone); if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) goto out; +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); vrele(nd.ni_vp); + vput(nd.ni_dvp); error = EEXIST; goto out; } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + 
return (error); + goto restart; + } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); @@ -1327,6 +1357,7 @@ symlink(p, uap) if (error == 0) vput(nd.ni_vp); vput(nd.ni_dvp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: @@ -1346,8 +1377,10 @@ undelete(p, uap) } */ *uap; { int error; + struct mount *mp; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, SCARG(uap, path), p); @@ -1357,19 +1390,23 @@ undelete(p, uap) if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); + vput(nd.ni_dvp); return (EEXIST); } - + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); @@ -1391,18 +1428,17 @@ unlink(p, uap) syscallarg(char *) path; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { @@ -1414,18 +1450,24 @@ unlink(p, uap) if (vp->v_flag & VROOT) error = EBUSY; } - + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + 
vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (!error) { VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (vp != NULLVP) - vput(vp); + vput(nd.ni_dvp); + vput(vp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); @@ -1936,6 +1978,7 @@ setfflags(p, vp, flags) int flags; { int error; + struct mount *mp; struct vattr vattr; /* @@ -1948,12 +1991,15 @@ setfflags(p, vp, flags) ((error = suser_xxx(p->p_ucred, p, PRISON_ROOT)) != 0)) return (error); + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -2020,14 +2066,18 @@ setfmode(p, vp, mode) int mode; { int error; + struct mount *mp; struct vattr vattr; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return error; } @@ -2125,8 +2175,11 @@ setfown(p, vp, uid, gid) gid_t gid; { int error; + struct mount *mp; struct vattr vattr; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); @@ -2134,6 +2187,7 @@ setfown(p, vp, uid, gid) vattr.va_gid = gid; error = VOP_SETATTR(vp, &vattr, 
p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return error; } @@ -2259,8 +2313,11 @@ setutimes(p, vp, ts, nullflag) int nullflag; { int error; + struct mount *mp; struct vattr vattr; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); @@ -2270,6 +2327,7 @@ setutimes(p, vp, ts, nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return error; } @@ -2394,7 +2452,8 @@ truncate(p, uap) syscallarg(off_t) length; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; @@ -2405,6 +2464,10 @@ truncate(p, uap) if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); @@ -2417,6 +2480,7 @@ truncate(p, uap) error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); } vput(vp); + vn_finished_write(mp); return (error); } @@ -2440,6 +2504,7 @@ ftruncate(p, uap) syscallarg(off_t) length; } */ *uap; { + struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; @@ -2452,6 +2517,8 @@ ftruncate(p, uap) if ((fp->f_flag & FWRITE) == 0) return (EINVAL); vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (vp->v_type == VDIR) @@ -2462,6 +2529,7 @@ ftruncate(p, uap) error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); } VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -2541,13 +2609,16 @@ fsync(p, uap) syscallarg(int) fd; } */ *uap; { - register struct vnode *vp; + struct vnode *vp; + struct mount 
*mp; struct file *fp; int error; if ((error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (vp->v_object) vm_object_page_clean(vp->v_object, 0, 0, 0); @@ -2558,6 +2629,7 @@ fsync(p, uap) #endif VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -2580,7 +2652,8 @@ rename(p, uap) syscallarg(char *) to; } */ *uap; { - register struct vnode *tvp, *fvp, *tdvp; + struct mount *mp; + struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; @@ -2590,6 +2663,12 @@ rename(p, uap) if ((error = namei(&fromnd)) != 0) return (error); fvp = fromnd.ni_vp; + if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), p); if (fromnd.ni_vp->v_type == VDIR) @@ -2652,6 +2731,7 @@ out: vrele(fvp); } vrele(tond.ni_startdir); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); @@ -2682,11 +2762,13 @@ mkdir(p, uap) syscallarg(int) mode; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); nd.ni_cnd.cn_flags |= WILLBEDIR; @@ -2695,13 +2777,17 @@ mkdir(p, uap) vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); vrele(vp); + vput(nd.ni_dvp); return (EEXIST); } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return 
(error); + goto restart; + } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; @@ -2711,6 +2797,7 @@ mkdir(p, uap) vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); @@ -2732,10 +2819,12 @@ rmdir(p, uap) syscallarg(char *) path; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), p); @@ -2756,21 +2845,32 @@ rmdir(p, uap) /* * The root of a mounted filesystem cannot be deleted. */ - if (vp->v_flag & VROOT) + if (vp->v_flag & VROOT) { error = EBUSY; - else { - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + goto out; } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); - if (vp != NULLVP) - vput(vp); + vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); @@ -3049,7 +3149,8 @@ revoke(p, uap) syscallarg(char *) path; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; @@ -3068,8 +3169,11 @@ revoke(p, uap) if (p->p_ucred->cr_uid != vattr.va_uid && (error = suser_xxx(0, p, PRISON_ROOT))) goto out; + if ((error = 
vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); out: vrele(vp); return (error); @@ -3228,11 +3332,16 @@ fhopen(p, uap) } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, p); /* XXX */ + if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, p->p_ucred, p); + vn_finished_write(mp); if (error) goto bad; } @@ -3407,10 +3516,15 @@ extattrctl(p, uap) NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); - mp = nd.ni_vp->v_mount; + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); NDFREE(&nd, 0); - return (VFS_EXTATTRCTL(mp, SCARG(uap, cmd), SCARG(uap, attrname), - SCARG(uap, arg), p)); + vrele(nd.ni_vp); + if (error) + return (error); + error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), SCARG(uap, attrname), + SCARG(uap, arg), p); + vn_finished_write(mp); + return (error); } /* @@ -3425,6 +3539,7 @@ extattr_set_file(p, uap) struct extattr_set_file_args *uap; { struct nameidata nd; + struct mount *mp; struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; char attrname[EXTATTR_MAXNAMELEN]; @@ -3434,10 +3549,11 @@ extattr_set_file(p, uap) error = copyin(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN); if (error) return (error); - NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, SCARG(uap, path), - p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return(error); + if ((error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH)) != 0) + goto done; iovlen = uap->iovcnt * sizeof(struct iovec); if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { @@ -3477,6 +3593,8 @@ done: if (needfree) FREE(needfree, M_IOV); NDFREE(&nd, 0); + vrele(nd.ni_vp); + 
vn_finished_write(mp); return (error); } @@ -3508,6 +3626,7 @@ extattr_get_file(p, uap) if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { NDFREE(&nd, 0); + vrele(nd.ni_vp); return (EINVAL); } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); @@ -3545,6 +3664,7 @@ done: if (needfree) FREE(needfree, M_IOV); NDFREE(&nd, 0); + vrele(nd.ni_vp); return(error); } @@ -3557,6 +3677,7 @@ extattr_delete_file(p, uap) struct proc *p; struct extattr_delete_file_args *uap; { + struct mount *mp; struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; @@ -3564,12 +3685,17 @@ extattr_delete_file(p, uap) error = copyin(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN); if (error) return(error); - NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, SCARG(uap, path), - p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return(error); + if ((error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(nd.ni_vp); + return (error); + } error = VOP_SETEXTATTR(nd.ni_vp, attrname, NULL, p->p_cred->pc_ucred, p); NDFREE(&nd, 0); + vrele(nd.ni_vp); + vn_finished_write(mp); return(error); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 6483660..0e5ec3f 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -453,6 +453,7 @@ getnewvnode(tag, mp, vops, vpp) int s, count; struct proc *p = curproc; /* XXX */ struct vnode *vp = NULL; + struct mount *vnmp; vm_object_t object; /* @@ -491,7 +492,14 @@ getnewvnode(tag, mp, vops, vpp) vp = NULL; continue; } - break; + /* + * Skip over it if its filesystem is being suspended. 
+ */ + if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) + break; + simple_unlock(&vp->v_interlock); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; } if (vp) { vp->v_flag |= VDOOMED; @@ -504,6 +512,7 @@ getnewvnode(tag, mp, vops, vpp) } else { simple_unlock(&vp->v_interlock); } + vn_finished_write(vnmp); #ifdef INVARIANTS { @@ -515,6 +524,8 @@ getnewvnode(tag, mp, vops, vpp) if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); + if (vp->v_writecount != 0) + panic("Non-zero write count"); } #endif vp->v_flag = 0; @@ -523,7 +534,6 @@ getnewvnode(tag, mp, vops, vpp) vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; - vp->v_writecount = 0; /* XXX */ } else { simple_unlock(&vnode_free_list_slock); vp = (struct vnode *) zalloc(vnode_zone); @@ -946,6 +956,7 @@ sched_sync(void) { struct synclist *slp; struct vnode *vp; + struct mount *mp; long starttime; int s; struct proc *p = updateproc; @@ -970,10 +981,12 @@ sched_sync(void) splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { - if (VOP_ISLOCKED(vp, NULL) == 0) { + if (VOP_ISLOCKED(vp, NULL) == 0 && + vn_start_write(vp, &mp, V_NOWAIT) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); } s = splbio(); if (LIST_FIRST(slp) == vp) { @@ -1386,6 +1399,7 @@ vrele(vp) struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); + KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close")); simple_lock(&vp->v_interlock); @@ -1427,6 +1441,7 @@ vput(vp) struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); + KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); simple_lock(&vp->v_interlock); @@ -1632,6 +1647,8 @@ vclean(vp, flags, p) * If the flush fails, just toss the buffers. 
*/ if (flags & DOCLOSE) { + if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) + (void) vn_write_suspend_wait(vp, V_WAIT); if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0) vinvalbuf(vp, 0, NOCRED, p, 0, 0); } @@ -2785,12 +2802,18 @@ sync_fsync(ap) simple_unlock(&mountlist_slock); return (0); } + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { + vfs_unbusy(mp, p); + simple_unlock(&mountlist_slock); + return (0); + } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; + vn_finished_write(mp); vfs_unbusy(mp, p); return (0); } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 65a297ca..404114a 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -164,8 +164,8 @@ mount(p, uap) vput(vp); return (EOPNOTSUPP); /* Needs translation */ } - mp->mnt_flag |= - SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + mp->mnt_flag |= SCARG(uap, flags) & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); /* * Only root, or the user that did the original mount is * permitted to update it. 
@@ -303,7 +303,8 @@ update: vrele(vp); if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; - mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) { mp->mnt_flag = flag; @@ -458,7 +459,7 @@ unmount(p, uap) */ int dounmount(mp, flags, p) - register struct mount *mp; + struct mount *mp; int flags; struct proc *p; { @@ -469,6 +470,7 @@ dounmount(mp, flags, p) simple_lock(&mountlist_slock); mp->mnt_kern_flag |= MNTK_UNMOUNT; lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); @@ -481,8 +483,10 @@ dounmount(mp, flags, p) vrele(mp->mnt_syncer); if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || - (flags & MNT_FORCE)) + (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, p); + } + vn_finished_write(mp); simple_lock(&mountlist_slock); if (error) { if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) @@ -530,7 +534,7 @@ sync(p, uap) struct proc *p; struct sync_args *uap; { - register struct mount *mp, *nmp; + struct mount *mp, *nmp; int asyncflag; simple_lock(&mountlist_slock); @@ -539,13 +543,15 @@ sync(p, uap) nmp = TAILQ_NEXT(mp, mnt_list); continue; } - if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, - ((p != NULL) ? p->p_ucred : NOCRED), p); + ((p != NULL) ? 
p->p_ucred : NOCRED), p); mp->mnt_flag |= asyncflag; + vn_finished_write(mp); } simple_lock(&mountlist_slock); nmp = TAILQ_NEXT(mp, mnt_list); @@ -593,7 +599,7 @@ quotactl(p, uap) syscallarg(caddr_t) arg; } */ *uap; { - register struct mount *mp; + struct mount *mp; int error; struct nameidata nd; @@ -602,11 +608,15 @@ quotactl(p, uap) NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); - mp = nd.ni_vp->v_mount; NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); vrele(nd.ni_vp); - return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), - SCARG(uap, arg), p)); + if (error) + return (error); + error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p); + vn_finished_write(mp); + return (error); } /* @@ -972,6 +982,7 @@ open(p, uap) struct file *fp; struct vnode *vp; struct vattr vat; + struct mount *mp; int cmode, flags, oflags; struct file *nfp; int type, indx, error; @@ -1029,12 +1040,15 @@ open(p, uap) fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto bad; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_SETATTR(vp, &vat, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); if (error) goto bad; } @@ -1101,7 +1115,8 @@ mknod(p, uap) syscallarg(int) dev; } */ *uap; { - register struct vnode *vp; + struct vnode *vp; + struct mount *mp; struct vattr vattr; int error; int whiteout = 0; @@ -1118,14 +1133,16 @@ mknod(p, uap) } if (error) return (error); +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; - if (vp != NULL) + if (vp != NULL) { + vrele(vp); error = EEXIST; - else { + } else { VATTR_NULL(&vattr); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; vattr.va_rdev = 
SCARG(uap, dev); @@ -1149,6 +1166,13 @@ mknod(p, uap) break; } } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } if (!error) { VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); if (whiteout) @@ -1159,17 +1183,10 @@ mknod(p, uap) if (error == 0) vput(nd.ni_vp); } - NDFREE(&nd, NDF_ONLY_PNBUF); - vput(nd.ni_dvp); - } else { - NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (vp) - vrele(vp); } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); @@ -1193,23 +1210,29 @@ mkfifo(p, uap) syscallarg(int) mode; } */ *uap; { + struct mount *mp; struct vattr vattr; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); vrele(nd.ni_vp); + vput(nd.ni_dvp); return (EEXIST); } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; @@ -1219,6 +1242,7 @@ mkfifo(p, uap) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); + vn_finished_write(mp); return (error); } @@ -1240,7 +1264,8 @@ link(p, uap) syscallarg(char *) link; } */ *uap; { - register struct vnode *vp; + struct vnode *vp; + struct mount *mp; struct nameidata nd; int error; @@ -1250,30 +1275,29 @@ link(p, uap) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; - if 
(vp->v_type == VDIR) - error = EPERM; /* POSIX */ - else { - NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); - error = namei(&nd); - if (!error) { - if (nd.ni_vp != NULL) { - if (nd.ni_vp) - vrele(nd.ni_vp); - error = EEXIST; - } else { - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, - LEASE_WRITE); - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); - } - NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); } vrele(vp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); @@ -1297,6 +1321,7 @@ symlink(p, uap) syscallarg(char *) link; } */ *uap; { + struct mount *mp; struct vattr vattr; char *path; int error; @@ -1305,20 +1330,25 @@ symlink(p, uap) path = zalloc(namei_zone); if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) goto out; +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); vrele(nd.ni_vp); + vput(nd.ni_dvp); error = EEXIST; goto out; } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + 
return (error); + goto restart; + } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); @@ -1327,6 +1357,7 @@ symlink(p, uap) if (error == 0) vput(nd.ni_vp); vput(nd.ni_dvp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: @@ -1346,8 +1377,10 @@ undelete(p, uap) } */ *uap; { int error; + struct mount *mp; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, SCARG(uap, path), p); @@ -1357,19 +1390,23 @@ undelete(p, uap) if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == nd.ni_vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); + vput(nd.ni_dvp); return (EEXIST); } - + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); @@ -1391,18 +1428,17 @@ unlink(p, uap) syscallarg(char *) path; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { @@ -1414,18 +1450,24 @@ unlink(p, uap) if (vp->v_flag & VROOT) error = EBUSY; } - + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + 
vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (!error) { VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); - if (vp != NULLVP) - vput(vp); + vput(nd.ni_dvp); + vput(vp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); @@ -1936,6 +1978,7 @@ setfflags(p, vp, flags) int flags; { int error; + struct mount *mp; struct vattr vattr; /* @@ -1948,12 +1991,15 @@ setfflags(p, vp, flags) ((error = suser_xxx(p->p_ucred, p, PRISON_ROOT)) != 0)) return (error); + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -2020,14 +2066,18 @@ setfmode(p, vp, mode) int mode; { int error; + struct mount *mp; struct vattr vattr; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return error; } @@ -2125,8 +2175,11 @@ setfown(p, vp, uid, gid) gid_t gid; { int error; + struct mount *mp; struct vattr vattr; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); @@ -2134,6 +2187,7 @@ setfown(p, vp, uid, gid) vattr.va_gid = gid; error = VOP_SETATTR(vp, &vattr, 
p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return error; } @@ -2259,8 +2313,11 @@ setutimes(p, vp, ts, nullflag) int nullflag; { int error; + struct mount *mp; struct vattr vattr; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); VATTR_NULL(&vattr); @@ -2270,6 +2327,7 @@ setutimes(p, vp, ts, nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return error; } @@ -2394,7 +2452,8 @@ truncate(p, uap) syscallarg(off_t) length; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; @@ -2405,6 +2464,10 @@ truncate(p, uap) if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); @@ -2417,6 +2480,7 @@ truncate(p, uap) error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); } vput(vp); + vn_finished_write(mp); return (error); } @@ -2440,6 +2504,7 @@ ftruncate(p, uap) syscallarg(off_t) length; } */ *uap; { + struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; @@ -2452,6 +2517,8 @@ ftruncate(p, uap) if ((fp->f_flag & FWRITE) == 0) return (EINVAL); vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (vp->v_type == VDIR) @@ -2462,6 +2529,7 @@ ftruncate(p, uap) error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); } VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -2541,13 +2609,16 @@ fsync(p, uap) syscallarg(int) fd; } */ *uap; { - register struct vnode *vp; + struct vnode *vp; + struct mount 
*mp; struct file *fp; int error; if ((error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (vp->v_object) vm_object_page_clean(vp->v_object, 0, 0, 0); @@ -2558,6 +2629,7 @@ fsync(p, uap) #endif VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -2580,7 +2652,8 @@ rename(p, uap) syscallarg(char *) to; } */ *uap; { - register struct vnode *tvp, *fvp, *tdvp; + struct mount *mp; + struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; @@ -2590,6 +2663,12 @@ rename(p, uap) if ((error = namei(&fromnd)) != 0) return (error); fvp = fromnd.ni_vp; + if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), p); if (fromnd.ni_vp->v_type == VDIR) @@ -2652,6 +2731,7 @@ out: vrele(fvp); } vrele(tond.ni_startdir); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); @@ -2682,11 +2762,13 @@ mkdir(p, uap) syscallarg(int) mode; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); nd.ni_cnd.cn_flags |= WILLBEDIR; @@ -2695,13 +2777,17 @@ mkdir(p, uap) vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); - if (nd.ni_dvp == vp) - vrele(nd.ni_dvp); - else - vput(nd.ni_dvp); vrele(vp); + vput(nd.ni_dvp); return (EEXIST); } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return 
(error); + goto restart; + } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; @@ -2711,6 +2797,7 @@ mkdir(p, uap) vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); + vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); @@ -2732,10 +2819,12 @@ rmdir(p, uap) syscallarg(char *) path; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; int error; struct nameidata nd; +restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), p); @@ -2756,21 +2845,32 @@ rmdir(p, uap) /* * The root of a mounted filesystem cannot be deleted. */ - if (vp->v_flag & VROOT) + if (vp->v_flag & VROOT) { error = EBUSY; - else { - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + goto out; } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); - if (vp != NULLVP) - vput(vp); + vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); @@ -3049,7 +3149,8 @@ revoke(p, uap) syscallarg(char *) path; } */ *uap; { - register struct vnode *vp; + struct mount *mp; + struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; @@ -3068,8 +3169,11 @@ revoke(p, uap) if (p->p_ucred->cr_uid != vattr.va_uid && (error = suser_xxx(0, p, PRISON_ROOT))) goto out; + if ((error = 
vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); out: vrele(vp); return (error); @@ -3228,11 +3332,16 @@ fhopen(p, uap) } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, p); /* XXX */ + if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, p->p_ucred, p); + vn_finished_write(mp); if (error) goto bad; } @@ -3407,10 +3516,15 @@ extattrctl(p, uap) NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return (error); - mp = nd.ni_vp->v_mount; + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); NDFREE(&nd, 0); - return (VFS_EXTATTRCTL(mp, SCARG(uap, cmd), SCARG(uap, attrname), - SCARG(uap, arg), p)); + vrele(nd.ni_vp); + if (error) + return (error); + error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), SCARG(uap, attrname), + SCARG(uap, arg), p); + vn_finished_write(mp); + return (error); } /* @@ -3425,6 +3539,7 @@ extattr_set_file(p, uap) struct extattr_set_file_args *uap; { struct nameidata nd; + struct mount *mp; struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; char attrname[EXTATTR_MAXNAMELEN]; @@ -3434,10 +3549,11 @@ extattr_set_file(p, uap) error = copyin(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN); if (error) return (error); - NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, SCARG(uap, path), - p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return(error); + if ((error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH)) != 0) + goto done; iovlen = uap->iovcnt * sizeof(struct iovec); if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { @@ -3477,6 +3593,8 @@ done: if (needfree) FREE(needfree, M_IOV); NDFREE(&nd, 0); + vrele(nd.ni_vp); + 
vn_finished_write(mp); return (error); } @@ -3508,6 +3626,7 @@ extattr_get_file(p, uap) if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { NDFREE(&nd, 0); + vrele(nd.ni_vp); return (EINVAL); } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); @@ -3545,6 +3664,7 @@ done: if (needfree) FREE(needfree, M_IOV); NDFREE(&nd, 0); + vrele(nd.ni_vp); return(error); } @@ -3557,6 +3677,7 @@ extattr_delete_file(p, uap) struct proc *p; struct extattr_delete_file_args *uap; { + struct mount *mp; struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; @@ -3564,12 +3685,17 @@ extattr_delete_file(p, uap) error = copyin(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN); if (error) return(error); - NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, SCARG(uap, path), - p); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if ((error = namei(&nd)) != 0) return(error); + if ((error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(nd.ni_vp); + return (error); + } error = VOP_SETEXTATTR(nd.ni_vp, attrname, NULL, p->p_cred->pc_ucred, p); NDFREE(&nd, 0); + vrele(nd.ni_vp); + vn_finished_write(mp); return(error); } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 0d0dc24..0708f7c 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -103,12 +103,14 @@ vn_open(ndp, flagp, cmode) int *flagp, cmode; { struct vnode *vp; + struct mount *mp; struct proc *p = ndp->ni_cnd.cn_proc; struct ucred *cred = p->p_ucred; struct vattr vat; struct vattr *vap = &vat; int mode, fmode, error; +restart: fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; @@ -124,10 +126,19 @@ vn_open(ndp, flagp, cmode) vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; + if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(ndp->ni_dvp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } 
VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vput(ndp->ni_dvp); + vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); return (error); @@ -293,10 +304,17 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) { struct uio auio; struct iovec aiov; + struct mount *mp; int error; - if ((ioflg & IO_NODELOCKED) == 0) + if ((ioflg & IO_NODELOCKED) == 0) { + mp = NULL; + if (rw == UIO_WRITE && + vp->v_type != VCHR && vp->v_type != VBLK && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; @@ -316,8 +334,10 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) else if (auio.uio_resid && error == 0) error = EIO; - if ((ioflg & IO_NODELOCKED) == 0) + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); VOP_UNLOCK(vp, 0, p); + } return (error); } @@ -368,6 +388,7 @@ vn_write(fp, uio, cred, flags, p) int flags; { struct vnode *vp; + struct mount *mp; int error, ioflag; KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p", @@ -384,6 +405,10 @@ vn_write(fp, uio, cred, flags, p) if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; + mp = NULL; + if (vp->v_type != VCHR && vp->v_type != VBLK && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); VOP_LEASE(vp, p, cred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if ((flags & FOF_OFFSET) == 0) @@ -394,6 +419,7 @@ vn_write(fp, uio, cred, flags, p) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, p); + vn_finished_write(mp); return (error); } @@ -649,6 +675,140 @@ vn_closefile(fp, p) fp->f_cred, p)); } +/* + * Preparing to start a filesystem write operation. If the operation is + * permitted, then we bump the count of operations in progress and + * proceed. 
If a suspend request is in progress, we wait until the + * suspension is over, and then proceed. + */ +int +vn_start_write(vp, mpp, flags) + struct vnode *vp; + struct mount **mpp; + int flags; +{ + struct mount *mp; + int error; + + /* + * If a vnode is provided, get and return the mount point that + * to which it will write. + */ + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + if ((mp = *mpp) == NULL) + return (0); + /* + * Check on status of suspension. + */ + while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + if (flags & V_NOWAIT) + return (EWOULDBLOCK); + error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH), + "suspfs", 0); + if (error) + return (error); + } + if (flags & V_XSLEEP) + return (0); + mp->mnt_writeopcount++; + return (0); +} + +/* + * Secondary suspension. Used by operations such as vop_inactive + * routines that are needed by the higher level functions. These + * are allowed to proceed until all the higher level functions have + * completed (indicated by mnt_writeopcount dropping to zero). At that + * time, these operations are halted until the suspension is over. + */ +int +vn_write_suspend_wait(vp, flags) + struct vnode *vp; + int flags; +{ + struct mount *mp; + int error; + + if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) { + if (error != EOPNOTSUPP) + return (error); + return (0); + } + /* + * If we are not suspended or have not yet reached suspended + * mode, then let the operation proceed. + */ + if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) + return (0); + if (flags & V_NOWAIT) + return (EWOULDBLOCK); + /* + * Wait for the suspension to finish. + */ + return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH), + "suspfs", 0)); +} + +/* + * Filesystem write operation has completed. If we are suspending and this + * operation is the last one, notify the suspender that the suspension is + * now in effect. 
+ */ +void +vn_finished_write(mp) + struct mount *mp; +{ + + if (mp == NULL) + return; + mp->mnt_writeopcount--; + if (mp->mnt_writeopcount < 0) + panic("vn_finished_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_writeopcount <= 0) + wakeup(&mp->mnt_writeopcount); +} + +/* + * Request a filesystem to suspend write operations. + */ +void +vfs_write_suspend(mp) + struct mount *mp; +{ + struct proc *p = curproc; + + if (mp->mnt_kern_flag & MNTK_SUSPEND) + return; + mp->mnt_kern_flag |= MNTK_SUSPEND; + if (mp->mnt_writeopcount > 0) + (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0); + VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p); + mp->mnt_kern_flag |= MNTK_SUSPENDED; +} + +/* + * Request a filesystem to resume write operations. + */ +void +vfs_write_resume(mp) + struct mount *mp; +{ + + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) + return; + mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED); + wakeup(&mp->mnt_writeopcount); + wakeup(&mp->mnt_flag); +} + static int filt_vnattach(struct knote *kn) { diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index 479cc92..bda7e98 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -394,6 +394,22 @@ vop_strategy { }; # +#% getwritemount vp = = = +# +vop_getwritemount { + IN struct vnode *vp; + OUT struct mount **mpp; +}; + +# +#% copyonwrite vp L L L +# +vop_copyonwrite { + IN struct vnode *vp; + IN struct buf *bp; +}; + +# #% print vp = = = # vop_print { diff --git a/sys/miscfs/fdesc/fdesc_vnops.c b/sys/miscfs/fdesc/fdesc_vnops.c index 772a94c..72c7cae 100644 --- a/sys/miscfs/fdesc/fdesc_vnops.c +++ b/sys/miscfs/fdesc/fdesc_vnops.c @@ -383,6 +383,8 @@ fdesc_setattr(ap) { struct filedesc *fdp = ap->a_p->p_fd; struct vattr *vap = ap->a_vap; + struct vnode *vp; + struct mount *mp; struct file *fp; unsigned fd; int error; @@ -403,8 +405,11 @@ fdesc_setattr(ap) switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: - error = VOP_SETATTR((struct vnode *) fp->f_data, 
ap->a_vap, - ap->a_cred, ap->a_p); + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred, ap->a_p); + vn_finished_write(mp); break; default: diff --git a/sys/miscfs/fifofs/fifo_vnops.c b/sys/miscfs/fifofs/fifo_vnops.c index 5bd13a7..03e3e37 100644 --- a/sys/miscfs/fifofs/fifo_vnops.c +++ b/sys/miscfs/fifofs/fifo_vnops.c @@ -107,6 +107,7 @@ static struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { { &vop_open_desc, (vop_t *) fifo_open }, { &vop_pathconf_desc, (vop_t *) fifo_pathconf }, { &vop_poll_desc, (vop_t *) fifo_poll }, + { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_print_desc, (vop_t *) fifo_print }, { &vop_read_desc, (vop_t *) fifo_read }, { &vop_readdir_desc, (vop_t *) fifo_badop }, diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index cbe52f4..baf40c3 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -88,6 +88,7 @@ static struct vnodeopv_entry_desc spec_vnodeop_entries[] = { { &vop_open_desc, (vop_t *) spec_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) spec_poll }, + { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_print_desc, (vop_t *) spec_print }, { &vop_read_desc, (vop_t *) spec_read }, { &vop_readdir_desc, (vop_t *) vop_panic }, @@ -415,16 +416,29 @@ spec_strategy(ap) struct buf *bp; struct vnode *vp; struct mount *mp; + int error; bp = ap->a_bp; - if ((bp->b_iocmd == BIO_WRITE) && (LIST_FIRST(&bp->b_dep)) != NULL) - buf_start(bp); - + vp = ap->a_vp; + if ((bp->b_iocmd == BIO_WRITE)) { + if (vp->v_mount != NULL && + (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) + panic("spec_strategy: bad I/O"); + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_start(bp); + if ((vp->v_flag & VCOPYONWRITE) && + (error = VOP_COPYONWRITE(vp, bp)) != 0 && + error != EOPNOTSUPP) { + bp->b_io.bio_error = error; + 
bp->b_io.bio_flags |= BIO_ERROR; + biodone(&bp->b_io); + return (0); + } + } /* * Collect statistics on synchronous and asynchronous read * and write counts for disks that have associated filesystems. */ - vp = ap->a_vp; if (vn_isdisk(vp, NULL) && (mp = vp->v_specmountpoint) != NULL) { if (bp->b_iocmd == BIO_WRITE) { if (bp->b_lock.lk_lockholder == LK_KERNPROC) diff --git a/sys/miscfs/union/union_subr.c b/sys/miscfs/union/union_subr.c index 6b88bef..d1d6e31 100644 --- a/sys/miscfs/union/union_subr.c +++ b/sys/miscfs/union/union_subr.c @@ -747,6 +747,7 @@ union_copyup(un, docopy, cred, p) struct proc *p; { int error; + struct mount *mp; struct vnode *lvp, *uvp; /* @@ -759,9 +760,12 @@ union_copyup(un, docopy, cred, p) if (error) return (error); - error = union_vn_create(&uvp, un, p); - if (error) + if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0) return (error); + if ((error = union_vn_create(&uvp, un, p)) != 0) { + vn_finished_write(mp); + return (error); + } lvp = un->un_lowervp; @@ -785,6 +789,7 @@ union_copyup(un, docopy, cred, p) } VOP_UNLOCK(uvp, 0, p); + vn_finished_write(mp); union_newupper(un, uvp); KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); union_vn_close(uvp, FWRITE, cred, p); @@ -910,11 +915,15 @@ union_mkshadow(um, dvp, cnp, vpp) struct vattr va; struct proc *p = cnp->cn_proc; struct componentname cn; + struct mount *mp; - error = union_relookup(um, dvp, vpp, cnp, &cn, - cnp->cn_nameptr, cnp->cn_namelen); - if (error) + if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + if ((error = union_relookup(um, dvp, vpp, cnp, &cn, + cnp->cn_nameptr, cnp->cn_namelen)) != 0) { + vn_finished_write(mp); return (error); + } if (*vpp) { if (cn.cn_flags & HASBUF) { @@ -925,6 +934,7 @@ union_mkshadow(um, dvp, cnp, vpp) vrele(*vpp); else vput(*vpp); + vn_finished_write(mp); *vpp = NULLVP; return (EEXIST); } @@ -950,6 +960,7 @@ union_mkshadow(um, dvp, cnp, vpp) cn.cn_flags &= ~HASBUF; } 
/*vput(dvp);*/ + vn_finished_write(mp); return (error); } @@ -973,10 +984,15 @@ union_mkwhiteout(um, dvp, cnp, path) struct proc *p = cnp->cn_proc; struct vnode *wvp; struct componentname cn; + struct mount *mp; + if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); - if (error) + if (error) { + vn_finished_write(mp); return (error); + } if (wvp) { if (cn.cn_flags & HASBUF) { @@ -987,6 +1003,7 @@ union_mkwhiteout(um, dvp, cnp, path) vrele(wvp); else vput(wvp); + vn_finished_write(mp); return (EEXIST); } @@ -998,6 +1015,7 @@ union_mkwhiteout(um, dvp, cnp, path) zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } + vn_finished_write(mp); return (error); } diff --git a/sys/miscfs/union/union_vnops.c b/sys/miscfs/union/union_vnops.c index 1c5ed5d..d7b95f3 100644 --- a/sys/miscfs/union/union_vnops.c +++ b/sys/miscfs/union/union_vnops.c @@ -93,6 +93,7 @@ static int union_print __P((struct vop_print_args *ap)); static int union_read __P((struct vop_read_args *ap)); static int union_readdir __P((struct vop_readdir_args *ap)); static int union_readlink __P((struct vop_readlink_args *ap)); +static int union_getwritemount __P((struct vop_getwritemount_args *ap)); static int union_reclaim __P((struct vop_reclaim_args *ap)); static int union_remove __P((struct vop_remove_args *ap)); static int union_rename __P((struct vop_rename_args *ap)); @@ -1681,6 +1682,20 @@ union_readlink(ap) return (error); } +static int +union_getwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + struct vnode *vp = UPPERVP(ap->a_vp); + + if (vp == NULL) + panic("union: missing upper layer in getwritemount"); + return(VOP_GETWRITEMOUNT(vp, ap->a_mpp)); +} + /* * union_inactive: * @@ -1963,6 +1978,7 @@ static struct vnodeopv_entry_desc union_vnodeop_entries[] = { { &vop_read_desc, (vop_t *) union_read }, { &vop_readdir_desc, (vop_t *) 
union_readdir }, { &vop_readlink_desc, (vop_t *) union_readlink }, + { &vop_getwritemount_desc, (vop_t *) union_getwritemount }, { &vop_reclaim_desc, (vop_t *) union_reclaim }, { &vop_remove_desc, (vop_t *) union_remove }, { &vop_rename_desc, (vop_t *) union_rename }, diff --git a/sys/nfs/nfs_serv.c b/sys/nfs/nfs_serv.c index 06ce9ed..0334f74 100644 --- a/sys/nfs/nfs_serv.c +++ b/sys/nfs/nfs_serv.c @@ -325,10 +325,18 @@ nfsrv_setattr(nfsd, slp, procp, mrq) struct mbuf *mb, *mb2, *mreq; u_quad_t frev; struct timespec guard; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; VATTR_NULL(vap); if (v3) { nfsm_srvsattr(vap); @@ -440,6 +448,7 @@ out: nfsmout: if (vp) vput(vp); + vn_finished_write(mp); return(error); } @@ -1039,6 +1048,7 @@ nfsrv_write(nfsd, slp, procp, mrq) struct uio io, *uiop = &io; off_t off; u_quad_t frev; + struct mount *mntp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (mrep == NULL) { @@ -1048,6 +1058,13 @@ nfsrv_write(nfsd, slp, procp, mrq) } fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mntp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mntp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mntp, V_WAIT); + vput(vp); + vp = NULL; if (v3) { nfsm_dissect(tl, u_int32_t *, 5 * NFSX_UNSIGNED); off = fxdr_hyper(tl); @@ -1205,6 +1222,7 @@ nfsrv_write(nfsd, slp, procp, mrq) nfsmout: if (vp) vput(vp); + vn_finished_write(mntp); return(error); } @@ -1241,6 +1259,7 @@ nfsrv_writegather(ndp, slp, procp, mrq) struct vnode *vp = NULL; struct uio io, *uiop = &io; u_quad_t frev, cur_usec; + struct mount *mntp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -1444,8 +1463,16 @@ loop1: mp = 
mp->m_next; } if (!error) { + if (vn_start_write(vp, &mntp, V_NOWAIT) != 0) { + VOP_UNLOCK(vp, 0, procp); + error = vn_start_write(NULL, &mntp, V_WAIT); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, procp); + } + } + if (!error) { error = VOP_WRITE(vp, uiop, ioflags, cred); nfsstats.srvvop_writes++; + vn_finished_write(mntp); } FREE((caddr_t)iov, M_TEMP); } @@ -1620,6 +1647,8 @@ nfsrv_create(nfsd, slp, procp, mrq) fhandle_t *fhp; u_quad_t frev, tempsize; u_char cverf[NFSX_V3CREATEVERF]; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -1629,6 +1658,12 @@ nfsrv_create(nfsd, slp, procp, mrq) fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; @@ -1869,6 +1904,7 @@ nfsmout: } if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return (error); } @@ -1901,12 +1937,20 @@ nfsrv_mknod(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; @@ -2030,6 +2074,7 @@ out: nfsm_srvpostop_attr(0, vap); } nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); + vn_finished_write(mp); return (0); nfsmout: if (dirp) @@ -2045,6 +2090,7 @@ nfsmout: } if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return (error); } @@ -2075,12 +2121,21 @@ nfsrv_remove(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", 
__FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; @@ -2137,6 +2192,7 @@ nfsmout: } if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return(error); } @@ -2170,6 +2226,8 @@ nfsrv_rename(nfsd, slp, procp, mrq) fhandle_t *ffhp, *tfhp; u_quad_t frev; uid_t saved_uid; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -2186,6 +2244,13 @@ nfsrv_rename(nfsd, slp, procp, mrq) ndclear(&tond); nfsm_srvmtofh(ffhp); + if ((mp = vfs_getvfs(&ffhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &ffhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); /* * Remember our original uid so that we can reset cr_uid before @@ -2360,6 +2425,7 @@ nfsmout: if (fromnd.ni_vp) vrele(fromnd.ni_vp); + vn_finished_write(mp); return (error); } @@ -2390,6 +2456,7 @@ nfsrv_link(nfsd, slp, procp, mrq) nfsfh_t nfh, dnfh; fhandle_t *fhp, *dfhp; u_quad_t frev; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); @@ -2397,6 +2464,13 @@ nfsrv_link(nfsd, slp, procp, mrq) fhp = &nfh.fh_generic; dfhp = &dnfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvmtofh(dfhp); nfsm_srvnamesiz(len); @@ -2475,6 +2549,7 @@ nfsmout: } if (nd.ni_vp) vrele(nd.ni_vp); + vn_finished_write(mp); return(error); } @@ -2508,12 +2583,21 @@ nfsrv_symlink(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; + 
struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; @@ -2651,6 +2735,7 @@ nfsmout: if (pathcp) FREE(pathcp, M_TEMP); + vn_finished_write(mp); return (error); } @@ -2685,12 +2770,21 @@ nfsrv_mkdir(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; @@ -2787,6 +2881,7 @@ nfsmout: else vrele(nd.ni_vp); } + vn_finished_write(mp); return (error); } @@ -2817,12 +2912,20 @@ nfsrv_rmdir(nfsd, slp, procp, mrq) fhandle_t *fhp; struct nameidata nd; u_quad_t frev; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; @@ -2895,6 +2998,7 @@ nfsmout: if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return(error); } @@ -3588,6 +3692,7 @@ nfsrv_commit(nfsd, slp, procp, mrq) char *cp2; struct mbuf *mb, *mb2, *mreq; u_quad_t frev, off; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef 
nolint @@ -3595,6 +3700,13 @@ nfsrv_commit(nfsd, slp, procp, mrq) #endif fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); /* @@ -3697,6 +3809,7 @@ nfsrv_commit(nfsd, slp, procp, mrq) nfsmout: if (vp) vput(vp); + vn_finished_write(mp); return(error); } @@ -4065,4 +4178,3 @@ nfsrv_access(vp, flags, cred, rdonly, p, override) return error; } #endif /* NFS_NOSERVER */ - diff --git a/sys/nfsserver/nfs_serv.c b/sys/nfsserver/nfs_serv.c index 06ce9ed..0334f74 100644 --- a/sys/nfsserver/nfs_serv.c +++ b/sys/nfsserver/nfs_serv.c @@ -325,10 +325,18 @@ nfsrv_setattr(nfsd, slp, procp, mrq) struct mbuf *mb, *mb2, *mreq; u_quad_t frev; struct timespec guard; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; VATTR_NULL(vap); if (v3) { nfsm_srvsattr(vap); @@ -440,6 +448,7 @@ out: nfsmout: if (vp) vput(vp); + vn_finished_write(mp); return(error); } @@ -1039,6 +1048,7 @@ nfsrv_write(nfsd, slp, procp, mrq) struct uio io, *uiop = &io; off_t off; u_quad_t frev; + struct mount *mntp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); if (mrep == NULL) { @@ -1048,6 +1058,13 @@ nfsrv_write(nfsd, slp, procp, mrq) } fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mntp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mntp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mntp, V_WAIT); + vput(vp); + vp = NULL; if (v3) { nfsm_dissect(tl, u_int32_t *, 5 * NFSX_UNSIGNED); off = fxdr_hyper(tl); @@ -1205,6 
+1222,7 @@ nfsrv_write(nfsd, slp, procp, mrq) nfsmout: if (vp) vput(vp); + vn_finished_write(mntp); return(error); } @@ -1241,6 +1259,7 @@ nfsrv_writegather(ndp, slp, procp, mrq) struct vnode *vp = NULL; struct uio io, *uiop = &io; u_quad_t frev, cur_usec; + struct mount *mntp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -1444,8 +1463,16 @@ loop1: mp = mp->m_next; } if (!error) { + if (vn_start_write(vp, &mntp, V_NOWAIT) != 0) { + VOP_UNLOCK(vp, 0, procp); + error = vn_start_write(NULL, &mntp, V_WAIT); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, procp); + } + } + if (!error) { error = VOP_WRITE(vp, uiop, ioflags, cred); nfsstats.srvvop_writes++; + vn_finished_write(mntp); } FREE((caddr_t)iov, M_TEMP); } @@ -1620,6 +1647,8 @@ nfsrv_create(nfsd, slp, procp, mrq) fhandle_t *fhp; u_quad_t frev, tempsize; u_char cverf[NFSX_V3CREATEVERF]; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -1629,6 +1658,12 @@ nfsrv_create(nfsd, slp, procp, mrq) fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; @@ -1869,6 +1904,7 @@ nfsmout: } if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return (error); } @@ -1901,12 +1937,20 @@ nfsrv_mknod(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; @@ -2030,6 +2074,7 @@ out: nfsm_srvpostop_attr(0, vap); } 
nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); + vn_finished_write(mp); return (0); nfsmout: if (dirp) @@ -2045,6 +2090,7 @@ nfsmout: } if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return (error); } @@ -2075,12 +2121,21 @@ nfsrv_remove(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; @@ -2137,6 +2192,7 @@ nfsmout: } if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return(error); } @@ -2170,6 +2226,8 @@ nfsrv_rename(nfsd, slp, procp, mrq) fhandle_t *ffhp, *tfhp; u_quad_t frev; uid_t saved_uid; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -2186,6 +2244,13 @@ nfsrv_rename(nfsd, slp, procp, mrq) ndclear(&tond); nfsm_srvmtofh(ffhp); + if ((mp = vfs_getvfs(&ffhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &ffhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); /* * Remember our original uid so that we can reset cr_uid before @@ -2360,6 +2425,7 @@ nfsmout: if (fromnd.ni_vp) vrele(fromnd.ni_vp); + vn_finished_write(mp); return (error); } @@ -2390,6 +2456,7 @@ nfsrv_link(nfsd, slp, procp, mrq) nfsfh_t nfh, dnfh; fhandle_t *fhp, *dfhp; u_quad_t frev; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); @@ -2397,6 +2464,13 @@ nfsrv_link(nfsd, slp, procp, mrq) fhp = &nfh.fh_generic; dfhp = &dnfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, 
&fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvmtofh(dfhp); nfsm_srvnamesiz(len); @@ -2475,6 +2549,7 @@ nfsmout: } if (nd.ni_vp) vrele(nd.ni_vp); + vn_finished_write(mp); return(error); } @@ -2508,12 +2583,21 @@ nfsrv_symlink(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; @@ -2651,6 +2735,7 @@ nfsmout: if (pathcp) FREE(pathcp, M_TEMP); + vn_finished_write(mp); return (error); } @@ -2685,12 +2770,21 @@ nfsrv_mkdir(nfsd, slp, procp, mrq) nfsfh_t nfh; fhandle_t *fhp; u_quad_t frev; + struct mount *mp = NULL; + struct vnode *vp; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = CREATE; @@ -2787,6 +2881,7 @@ nfsmout: else vrele(nd.ni_vp); } + vn_finished_write(mp); return (error); } @@ -2817,12 +2912,20 @@ nfsrv_rmdir(nfsd, slp, procp, mrq) fhandle_t *fhp; struct nameidata nd; u_quad_t frev; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); ndclear(&nd); fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + 
vp = NULL; nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; @@ -2895,6 +2998,7 @@ nfsmout: if (nd.ni_vp) vput(nd.ni_vp); + vn_finished_write(mp); return(error); } @@ -3588,6 +3692,7 @@ nfsrv_commit(nfsd, slp, procp, mrq) char *cp2; struct mbuf *mb, *mb2, *mreq; u_quad_t frev, off; + struct mount *mp = NULL; nfsdbprintf(("%s %d\n", __FILE__, __LINE__)); #ifndef nolint @@ -3595,6 +3700,13 @@ nfsrv_commit(nfsd, slp, procp, mrq) #endif fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); + if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp)) != NULL) + return (error); + (void) vn_start_write(vp, &mp, V_WAIT); + vput(vp); + vp = NULL; nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); /* @@ -3697,6 +3809,7 @@ nfsrv_commit(nfsd, slp, procp, mrq) nfsmout: if (vp) vput(vp); + vn_finished_write(mp); return(error); } @@ -4065,4 +4178,3 @@ nfsrv_access(vp, flags, cred, rdonly, p, override) return error; } #endif /* NFS_NOSERVER */ - diff --git a/sys/svr4/svr4_fcntl.c b/sys/svr4/svr4_fcntl.c index 4040030..c65f345 100644 --- a/sys/svr4/svr4_fcntl.c +++ b/sys/svr4/svr4_fcntl.c @@ -247,6 +247,7 @@ fd_revoke(p, fd) struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; + struct mount *mp; struct vattr vattr; int error, *retval; @@ -271,8 +272,11 @@ fd_revoke(p, fd) (error = suser(p)) != 0) goto out; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); out: vrele(vp); return error; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index bc8203f..116e011 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -466,6 +466,7 @@ buf_countdeps(struct buf *bp, int i) /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ +#define B_METAONLY 0x04 /* Return indirect block buffer. 
*/ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index d215351..fb80e5b 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -285,6 +285,7 @@ struct proc { /* Marked a kernel thread */ #define P_BUFEXHAUST 0x100000 /* dirty buffers flush is in progress */ #define P_KTHREADP 0x200000 /* Process is really a kernel thread */ +#define P_COWINPROGRESS 0x400000 /* Snapshot copy-on-write in progress */ #define P_DEADLKTREAT 0x800000 /* lock aquisition - deadlock treatment */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 5817855..3da7897 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -158,7 +158,7 @@ struct vnode { /* open for business 0x00800 */ /* open for business 0x01000 */ #define VOBJBUF 0x02000 /* Allocate buffers in VM object */ -/* open for business 0x04000 */ +#define VCOPYONWRITE 0x04000 /* vnode is doing copy-on-write */ #define VAGE 0x08000 /* Insert vnode at head of free list */ #define VOLOCK 0x10000 /* vnode is locked waiting for an object */ #define VOWANT 0x20000 /* a process is waiting for VOLOCK */ @@ -246,12 +246,15 @@ extern int vttoif_tab[]; /* * Flags to various vnode functions. 
*/ -#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ -#define FORCECLOSE 0x0002 /* vflush: force file closure */ -#define WRITECLOSE 0x0004 /* vflush: only close writable files */ -#define DOCLOSE 0x0008 /* vclean: close active files */ -#define V_SAVE 0x0001 /* vinvalbuf: sync file first */ -#define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ +#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ +#define FORCECLOSE 0x0002 /* vflush: force file closure */ +#define WRITECLOSE 0x0004 /* vflush: only close writable files */ +#define DOCLOSE 0x0008 /* vclean: close active files */ +#define V_SAVE 0x0001 /* vinvalbuf: sync file first */ +#define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ +#define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ +#define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ +#define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) @@ -572,6 +575,7 @@ int vrecycle __P((struct vnode *vp, struct simplelock *inter_lkp, struct proc *p)); int vn_close __P((struct vnode *vp, int flags, struct ucred *cred, struct proc *p)); +void vn_finished_write __P((struct mount *mp)); int vn_isdisk __P((struct vnode *vp, int *errp)); int vn_lock __P((struct vnode *vp, int flags, struct proc *p)); #ifdef DEBUG_LOCKS @@ -587,13 +591,18 @@ int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); +int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags)); dev_t vn_todev __P((struct vnode *vp)); +int vn_write_suspend_wait __P((struct vnode *vp, int flags)); +int vn_writechk __P((struct vnode *vp)); int vfs_cache_lookup __P((struct vop_lookup_args *ap)); int vfs_object_create __P((struct vnode *vp, struct proc *p, struct ucred *cred)); void vfs_timestamp 
__P((struct timespec *)); -int vn_writechk __P((struct vnode *vp)); +void vfs_write_resume __P((struct mount *mp)); +void vfs_write_suspend __P((struct mount *mp)); int vop_stdbwrite __P((struct vop_bwrite_args *ap)); +int vop_stdgetwritemount __P((struct vop_getwritemount_args *)); int vop_stdislocked __P((struct vop_islocked_args *)); int vop_stdlock __P((struct vop_lock_args *)); int vop_stdunlock __P((struct vop_unlock_args *)); diff --git a/sys/ufs/ffs/README.snapshot b/sys/ufs/ffs/README.snapshot new file mode 100644 index 0000000..f3177c3 --- /dev/null +++ b/sys/ufs/ffs/README.snapshot @@ -0,0 +1,112 @@ +$FreeBSD$ + +Soft Updates Status + +As is detailed in the operational information below, snapshots +are definitely alpha-test code and are NOT yet ready for production +use. Much remains to be done to make them really useful, but I +wanted to let folks get a chance to try it out and start reporting +bugs and other shortcomings. Such reports should be sent to +Kirk McKusick <mckusick@mckusick.com>. + + +Snapshot Copyright Restrictions + +Snapshots have been introduced to FreeBSD with a `Berkeley-style' +copyright. The file implementing snapshots resides in the sys/ufs/ffs +directory and is compiled into the generic kernel by default. + + +Using Snapshots + +To create a snapshot of your /var filesystem, run the command: + + mount -u -o snapshot /var/snapshot/snap1 /var + +This command will take a snapshot of your /var filesystem and +leave it in the file /var/snapshot/snap1. Note that snapshot +files must be created in the filesystem that is being snapshotted. +I use the convention of putting a `snapshot' directory at the +root of each filesystem into which I can place snapshots. +You may create up to 20 snapshots per filesystem. Active snapshots +are recorded in the superblock, so they persist across unmount +and remount operations and across system reboots. When you +are done with a snapshot, it can be removed with the `rm' +command. 
Snapshots may be removed in any order, however you +may not get back all the space contained in the snapshot as +another snapshot may claim some of the blocks that it is releasing. +Note that the `schg' flag is set on snapshots to ensure that +not even the root user can write to them. The unlink command +makes an exception for snapshot files in that it allows them +to be removed even though they have the `schg' flag set, so it +is not necessary to clear the `schg' flag before removing a +snapshot file. + +Once you have taken a snapshot, there are three interesting +things that you can do with it: + +1) Run fsck on the snapshot file. Assuming that the filesystem + was clean when it was mounted, you should always get a clean + (and unchanging) result from running fsck on the snapshot. + If you are running with soft updates and rebooted after a + crash without cleaning up the filesystem, then fsck of the + snapshot may find missing blocks and inodes or inodes with + link counts that are too high. I have not yet added the + system calls to allow fsck to add these missing resources + back to the filesystem - that will be added once the basic + snapshot code is working properly. So, view those reports + as informational for now. + +2) Run dump on the snapshot. You will get a dump that is + consistent with the filesystem as of the timestamp of the + snapshot. Note that I have not yet changed dump to set the + dumpdates file correctly, so do not use this feature in + production until that fix is made. + +3) Mount the snapshot as a frozen image of the filesystem. + To mount the snapshot /var/snapshot/snap1: + + vnconfig -c vn0c /var/snapshot/snap1 + mount -r /dev/vn0c /mnt + + You can now cruise around your frozen /var filesystem + at /mnt. Everything will be in the same state that it + was at the time the snapshot was taken. The one exception + is that any earlier snapshots will appear as zero length + files. 
When you are done with the mounted snapshot: + + umount /mnt + vnconfig -u vn0c + + Note that under some circumstances, the process accessing + the frozen filesystem may deadlock. I am aware of this + problem, but the solution is not simple. It requires + using buffer read locks rather than exclusive locks when + traversing the inode indirect blocks. Until this problem + is fixed, you should avoid putting mounted snapshots into + production. + + +Performance + +It takes about 30 seconds to create a snapshot of an 8Gb filesystem. +Of that time 25 seconds is spent in preparation; filesystem activity +is only suspended for the final 5 seconds of that period. Snapshot +removal of an 8Gb filesystem takes about two minutes. Filesystem +activity is never suspended during snapshot removal. + +The suspend time may be expanded by several minutes if a process +is in the midst of removing many files as all the soft updates +backlog must be cleared. Generally snapshots do not slow the system +down appreciably except when removing many small files (i.e., any +file less than 96Kb whose last block is a fragment) that are claimed +by a snapshot. Here, the snapshot code must make a copy of every +released fragment which slows the rate of file removal to about +twenty files per second once the soft updates backlog limit is +reached. 
+ + +How Snapshots Work + +For more general information on snapshots, please see: + http://www.mckusick.com/softdep/ diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 1f24b2b..5efe0e7 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -186,6 +186,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) *bpp = 0; fs = ip->i_fs; #ifdef DIAGNOSTIC + if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) + panic("ffs_realloccg: allocation on suspended filesystem"); if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( @@ -763,6 +765,10 @@ ffs_hashalloc(ip, cg, pref, size, allocator) long result; /* XXX why not same type as we return? */ int i, icg = cg; +#ifdef DIAGNOSTIC + if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) + panic("ffs_hashalloc: allocation on suspended filesystem"); +#endif fs = ip->i_fs; /* * 1: preferred cylinder group @@ -1311,9 +1317,13 @@ ffs_blkfree(ip, bno, size) ufs_daddr_t blkno; int i, error, cg, blk, frags, bbase; u_int8_t *blksfree; + struct vnode *vp; fs = ip->i_fs; - VOP_FREEBLKS(ip->i_devvp, fsbtodb(fs, bno), size); +#ifdef DIAGNOSTIC + if ((vp = ITOV(ip)) != NULL && vp->v_mount != NULL && + (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)) + panic("ffs_blkfree: deallocation on suspended filesystem"); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %ld, bsize = %ld, size = %ld, fs = %s\n", @@ -1321,6 +1331,11 @@ ffs_blkfree(ip, bno, size) fs->fs_fsmnt); panic("ffs_blkfree: bad size"); } +#endif + if ((ip->i_devvp->v_flag & VCOPYONWRITE) && + ffs_snapblkfree(ip, bno, size)) + return; + VOP_FREEBLKS(ip->i_devvp, fsbtodb(fs, bno), size); cg = dtog(fs, bno); if ((u_int)bno >= fs->fs_size) { printf("bad block %ld, ino %lu\n", diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 28cc1ed..92fe379 100644 --- 
a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -125,6 +125,8 @@ ffs_balloc(ap) * The first NDADDR blocks are direct blocks */ if (lbn < NDADDR) { + if (flags & B_METAONLY) + panic("ffs_balloc: B_METAONLY for direct block"); nb = ip->i_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); @@ -289,6 +291,13 @@ ffs_balloc(ap) } } /* + * If asked only for the indirect block, then return it. + */ + if (flags & B_METAONLY) { + *ap->a_bpp = bp; + return (0); + } + /* * Get the data block, allocating if necessary. */ if (nb == 0) { diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index fe7391b..8e011bb 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -67,6 +67,7 @@ struct vop_balloc_args; struct vop_bmap_args; struct vop_fsync_args; struct vop_reallocblks_args; +struct vop_copyonwrite_args; int ffs_alloc __P((struct inode *, ufs_daddr_t, ufs_daddr_t, int, struct ucred *, ufs_daddr_t *)); @@ -76,6 +77,7 @@ void ffs_blkfree __P((struct inode *, ufs_daddr_t, long)); ufs_daddr_t ffs_blkpref __P((struct inode *, ufs_daddr_t, int, ufs_daddr_t *)); int ffs_bmap __P((struct vop_bmap_args *)); void ffs_clrblock __P((struct fs *, u_char *, ufs_daddr_t)); +int ffs_copyonwrite __P((struct vop_copyonwrite_args *ap)); int ffs_fhtovp __P((struct mount *, struct fid *, struct vnode **)); int ffs_flushfiles __P((struct mount *, int, struct proc *)); void ffs_fragacct __P((struct fs *, int, int32_t [], int)); @@ -89,6 +91,10 @@ int ffs_reallocblks __P((struct vop_reallocblks_args *)); int ffs_realloccg __P((struct inode *, ufs_daddr_t, ufs_daddr_t, int, int, struct ucred *, struct buf **)); void ffs_setblock __P((struct fs *, u_char *, ufs_daddr_t)); +int ffs_snapblkfree __P((struct inode *freeip, ufs_daddr_t bno, long size)); +int ffs_snapshot __P((struct mount *mp, char *snapfile)); +void ffs_snapshot_mount __P((struct mount *mp)); +void ffs_snapshot_unmount __P((struct 
mount *mp)); int ffs_statfs __P((struct mount *, struct statfs *, struct proc *)); int ffs_sync __P((struct mount *, int, struct ucred *, struct proc *)); int ffs_truncate __P((struct vnode *, off_t, int, struct ucred *, struct proc *)); diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c new file mode 100644 index 0000000..73da537 --- /dev/null +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -0,0 +1,1028 @@ +/* + * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. + * + * Further information about snapshots can be obtained from: + * + * Marshall Kirk McKusick http://www.mckusick.com/softdep/ + * 1614 Oxford Street mckusick@mckusick.com + * Berkeley, CA 94709-1608 +1-510-843-9542 + * USA + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_snapshot.c 8.10 (McKusick) 7/11/00 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/stat.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/resource.h> +#include <sys/resourcevar.h> +#include <sys/vnode.h> + +#include <ufs/ufs/extattr.h> +#include <ufs/ufs/quota.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +#define KERNCRED proc0.p_ucred +#define CURPROC curproc +#define DEBUG + +static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t, + int, int, int, int)); +static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *)); +static int readblock __P((struct buf *, daddr_t)); + +#ifdef DEBUG +#include <sys/sysctl.h> +int snapdebug = 0; +SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); +#endif /* DEBUG */ + +/* + * Create a snapshot file and initialize it for the filesystem. 
+ */ +int +ffs_snapshot(mp, snapfile) + struct mount *mp; + char *snapfile; +{ + ufs_daddr_t rlbn; + ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP]; + int error, cg, snaploc, indiroff, numblks; + int i, size, base, len, loc, inoblkcnt; + int blksperindir, flag = mp->mnt_flag; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct proc *p = CURPROC; + struct inode *devip, *ip, *xp; + struct buf *bp, *nbp, *ibp; + struct vnode *vp, *devvp; + struct nameidata nd; + struct mount *wrtmp; + struct dinode *dip; + struct vattr vat; + struct cg *cgp; + + /* + * Need to serialize access to snapshot code per filesystem. + */ + /* + * Assign a snapshot slot in the superblock. + */ + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == 0) + break; + if (snaploc == FSMAXSNAP) + return (ENOSPC); + /* + * Create the snapshot file. + */ +restart: + NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + vput(nd.ni_vp); + error = EEXIST; + } + if (nd.ni_dvp->v_mount != mp) + error = EXDEV; + if (error) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (error); + } + VATTR_NULL(&vat); + vat.va_type = VREG; + vat.va_mode = S_IRUSR; + vat.va_vaflags |= VA_EXCLUSIVE; + if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) + wrtmp = NULL; + if (wrtmp != mp) + panic("ffs_snapshot: mount mismatch"); + if (vn_start_write(wrtmp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(wrtmp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE); + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); + vput(nd.ni_dvp); + if (error) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vn_finished_write(wrtmp); + return (error); + } + vp = nd.ni_vp; + ip = VTOI(vp); + devvp = ip->i_devvp; + devip = VTOI(devvp); + /* + * Allocate 
and copy the last block contents so as to be able + * to set size to that of the filesystem. + */ + numblks = howmany(fs->fs_size, fs->fs_frag); + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), + fs->fs_bsize, KERNCRED, B_CLRBUF, &bp); + if (error) + goto out; + ip->i_size = lblktosize(fs, (off_t)numblks); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if ((error = readblock(bp, numblks - 1)) != 0) + goto out; + bawrite(bp); + /* + * Preallocate critical data structures so that we can copy + * them in without further allocation after we suspend all + * operations on the filesystem. We would like to just release + * the allocated buffers without writing them since they will + * be filled in below once we are ready to go, but this upsets + * the soft update code, so we go ahead and write the new buffers. + * + * Allocate all indirect blocks. Also allocate shadow copies + * for each of the indirect blocks. + */ + for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); + if (error) + goto out; + copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); + bdwrite(ibp); + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), + fs->fs_bsize, p->p_ucred, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + } + /* + * Allocate shadow blocks to copy all of the other snapshot inodes + * so that we will be able to expunge them from this snapshot. + */ + for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) { + blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc])); + for (i = 0; i < inoblkcnt; i++) + if (inoblks[i] == blkno) + break; + if (i == inoblkcnt) { + inoblks[inoblkcnt++] = blkno; + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + } + } + /* + * Allocate all cylinder group blocks. 
+ */ + for (cg = 0; cg < fs->fs_ncg; cg++) { + error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + } + /* + * Allocate copies for the superblock and its summary information. + */ + error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED, + 0, &nbp); + if (error) + goto out; + bawrite(nbp); + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + } + /* + * Change inode to snapshot type file. + */ + ip->i_flags |= SF_IMMUTABLE | SF_SNAPSHOT; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Ensure that the snapshot is completely on disk. + */ + if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0) + goto out; + /* + * All allocations are done, so we can now snapshot the system. + * + * Suspend operation on filesystem. + */ + for (;;) { + vn_finished_write(wrtmp); + vfs_write_suspend(vp->v_mount); + if (mp->mnt_kern_flag & MNTK_SUSPENDED) + break; + vn_start_write(wrtmp, V_WAIT); + } + /* + * First, copy all the cylinder group maps. All the unallocated + * blocks are marked BLK_NOCOPY so that the snapshot knows that + * it need not copy them if they are later written. 
+ */ + len = howmany(fs->fs_fpg, fs->fs_frag); + for (cg = 0; cg < fs->fs_ncg; cg++) { + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, KERNCRED, &bp); + if (error) { + brelse(bp); + goto out1; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + error = EIO; + goto out1; + } + error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, + KERNCRED, &nbp); + if (error) { + brelse(bp); + brelse(nbp); + goto out1; + } + bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); + if (fs->fs_cgsize < fs->fs_bsize) + bzero(&nbp->b_data[fs->fs_cgsize], + fs->fs_bsize - fs->fs_cgsize); + bawrite(nbp); + base = cg * fs->fs_fpg / fs->fs_frag; + if (base + len > numblks) + len = numblks - base; + loc = 0; + if (base < NDADDR) { + for ( ; loc < NDADDR; loc++) { + if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) + continue; + ip->i_db[loc] = BLK_NOCOPY; + } + } + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); + if (error) { + brelse(bp); + goto out1; + } + indiroff = (base + loc - NDADDR) % NINDIR(fs); + for ( ; loc < len; loc++, indiroff++) { + if (indiroff >= NINDIR(fs)) { + bawrite(ibp); + error = VOP_BALLOC(vp, + lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); + if (error) { + brelse(bp); + goto out1; + } + indiroff = 0; + } + if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) + continue; + ((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; + } + brelse(bp); + bdwrite(ibp); + } + /* + * Snapshot the superblock and its summary information. 
+ */ + error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED, + 0, &nbp); + if (error) + goto out1; + bcopy(fs, nbp->b_data, fs->fs_sbsize); + ((struct fs *)(nbp->b_data))->fs_clean = 1; + if (fs->fs_sbsize < fs->fs_bsize) + bzero(&nbp->b_data[fs->fs_sbsize], + fs->fs_bsize - fs->fs_sbsize); + bawrite(nbp); + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize) - 1; + size = fs->fs_bsize; + for (loc = 0; loc <= len; loc++) { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out1; + if (loc == len) { + readblock(nbp, blkno + loc); + size = fs->fs_cssize % fs->fs_bsize; + } + bcopy(fs->fs_csp[loc], nbp->b_data, size); + bawrite(nbp); + } + /* + * Copy the shadow blocks for the snapshot inodes so that + * the copies can can be expunged. + */ + for (loc = 0; loc < inoblkcnt; loc++) { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out1; + readblock(nbp, inoblks[loc]); + bdwrite(nbp); + } + /* + * Copy allocation information from other snapshots and then + * expunge them from the view of the current snapshot. + */ + for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) { + /* + * Before expunging a snapshot inode, note all the + * blocks that it claims with BLK_SNAP so that fsck will + * be able to account for those blocks properly and so + * that this snapshot knows that it need not copy them + * if the other snapshot holding them is freed. 
+ */ + if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0) + goto out1; + blksperindir = 1; + lbn = -NDADDR; + len = numblks - NDADDR; + rlbn = NDADDR; + for (i = 0; len > 0 && i < NIADDR; i++) { + error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn, + rlbn, len, blksperindir); + if (error) + goto out1; + blksperindir *= NINDIR(fs); + lbn -= blksperindir + 1; + len -= blksperindir; + rlbn += blksperindir; + } + /* + * Set copied snapshot inode to be a zero length file. + */ + blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number)); + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out1; + dip = (struct dinode *)nbp->b_data + + ino_to_fsbo(fs, xp->i_number); + dip->di_size = 0; + dip->di_blocks = 0; + dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT); + bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t)); + bdwrite(nbp); + } + /* + * Copy all indirect blocks to their shadows (allocated above) + * to avoid deadlock in ffs_copyonwrite. + */ + for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); + if (error) + goto out1; + copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); + brelse(ibp); + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), + fs->fs_bsize, p->p_ucred, 0, &nbp); + if (error) + goto out1; + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); + if (error) { + brelse(nbp); + goto out1; + } + bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize); + brelse(ibp); + bawrite(nbp); + } + /* + * Record snapshot inode. Since this is the newest snapshot, + * it must be placed at the end of the list. 
+ */ + fs->fs_snapinum[snaploc] = ip->i_number; + if (ip->i_copyonwrite != 0) + panic("ffs_snapshot: %d already on list", ip->i_number); + if (devip->i_copyonwrite == 0) { + devvp->v_flag |= VCOPYONWRITE; + devip->i_copyonwrite = ip; + } else { + for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; ) + xp = xp->i_copyonwrite; + xp->i_copyonwrite = ip; + } + vp->v_flag |= VSYSTEM; + /* + * Resume operation on filesystem. + */ +out1: + vfs_write_resume(vp->v_mount); + vn_start_write(wrtmp, V_WAIT); +out: + mp->mnt_flag = flag; + (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + vn_finished_write(wrtmp); + return (error); +} + +/* + * Descend an indirect block chain for vnode cancelvp accounting for all + * its indirect blocks in snapvp. + */ +static int +indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir) + struct vnode *snapvp; + struct vnode *cancelvp; + int level; + ufs_daddr_t blkno; + int lbn; + int rlbn; + int remblks; + int blksperindir; +{ + int subblksperindir, error, last, num, i; + struct indir indirs[NIADDR + 2]; + ufs_daddr_t *bap; + struct buf *bp; + struct fs *fs; + + if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) + return (error); + if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) + panic("indiracct: botched params"); + /* + * We have to expand bread here since it will deadlock looking + * up the block number for any blocks that are not in the cache. + */ + fs = VTOI(cancelvp)->i_fs; + bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, blkno); + if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && + (error = readblock(bp, fragstoblks(fs, blkno)))) { + brelse(bp); + return (error); + } + /* + * Account for the block pointers in this indirect block. 
+ */ + last = howmany(remblks, blksperindir); + if (last > NINDIR(fs)) + last = NINDIR(fs); + if (snapvp != cancelvp) { + bap = (ufs_daddr_t *)bp->b_data; + } else { + MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); + bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); + brelse(bp); + } + error = snapacct(snapvp, &bap[0], &bap[last]); + if (error || level == 0) + goto out; + /* + * Account for the block pointers in each of the indirect blocks + * in the levels below us. + */ + subblksperindir = blksperindir / NINDIR(fs); + for (lbn++, level--, i = 0; i < last; i++) { + error = indiracct(snapvp, cancelvp, level, bap[i], lbn, + rlbn, remblks, subblksperindir); + if (error) + goto out; + rlbn += blksperindir; + lbn -= blksperindir; + remblks -= blksperindir; + } +out: + if (snapvp != cancelvp) + brelse(bp); + else + FREE(bap, M_DEVBUF); + return (error); +} + +/* + * Account for a set of blocks allocated in a snapshot inode. + */ +static int +snapacct(vp, oldblkp, lastblkp) + struct vnode *vp; + ufs_daddr_t *oldblkp, *lastblkp; +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + ufs_daddr_t lbn, blkno, *blkp; + struct buf *ibp; + int error; + + for ( ; oldblkp < lastblkp; oldblkp++) { + blkno = *oldblkp; + if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) + continue; + lbn = fragstoblks(fs, blkno); + if (lbn < NDADDR) { + blkp = &ip->i_db[lbn]; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); + if (error) + return (error); + blkp = &((ufs_daddr_t *)(ibp->b_data)) + [(lbn - NDADDR) % NINDIR(fs)]; + } + if (*blkp != 0) + panic("snapacct: bad block"); + *blkp = BLK_SNAP; + if (lbn >= NDADDR) + bdwrite(ibp); + } + return (0); +} + +/* + * Prepare a snapshot file for being removed. 
+ */ +void +ffs_snapremove(vp) + struct vnode *vp; +{ + struct inode *ip, *xp; + struct vnode *devvp; + struct buf *ibp; + struct fs *fs; + ufs_daddr_t blkno, dblk; + int error, snaploc, loc, last; + + ip = VTOI(vp); + fs = ip->i_fs; + /* + * Delete snapshot inode from superblock. Keep list dense. + */ + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == ip->i_number) + break; + if (snaploc < FSMAXSNAP) { + for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; + } + fs->fs_snapinum[snaploc - 1] = 0; + } + /* + * Delete from incore list. + * Clear copy-on-write flag if last snapshot. + */ + devvp = ip->i_devvp; + for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) { + if (xp->i_copyonwrite != ip) + continue; + xp->i_copyonwrite = ip->i_copyonwrite; + ip->i_copyonwrite = 0; + break; + } + if (xp == 0) { + printf("ffs_snapremove: lost snapshot vnode %d\n", + ip->i_number); + vref(vp); + } + if (VTOI(devvp)->i_copyonwrite == 0) + devvp->v_flag &= ~VCOPYONWRITE; + /* + * Clear all BLK_NOCOPY fields. Pass any block claims to other + * snapshots that want them (see ffs_snapblkfree below). 
+ */ + for (blkno = 1; blkno < NDADDR; blkno++) { + dblk = ip->i_db[blkno]; + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || + (dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(ip, dblk, fs->fs_bsize))) + ip->i_db[blkno] = 0; + } + for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) { + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); + if (error) + continue; + if ((last = fs->fs_size - blkno) > NINDIR(fs)) + last = NINDIR(fs); + for (loc = 0; loc < last; loc++) { + dblk = ((ufs_daddr_t *)(ibp->b_data))[loc]; + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || + (dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(ip, dblk, fs->fs_bsize))) + ((ufs_daddr_t *)(ibp->b_data))[loc] = 0; + } + bawrite(ibp); + } + /* + * Clear snapshot flag and drop reference. + */ + ip->i_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + vrele(vp); +} + +/* + * Notification that a block is being freed. Return zero if the free + * should be allowed to proceed. Return non-zero if the snapshot file + * wants to claim the block. The block will be claimed if it is an + * uncopied part of one of the snapshots. It will be freed if it is + * either a BLK_NOCOPY or has already been copied in all of the snapshots. + * If a fragment is being freed, then all snapshots that care about + * it must make a copy since a snapshot file can only claim full sized + * blocks. Note that if more than one snapshot file maps the block, + * we can pick one at random to claim it. Since none of the snapshots + * can change, we are assured that they will all see the same unmodified + * image. When deleting a snapshot file (see ffs_snapremove above), we + * must push any of these claimed blocks to one of the other snapshots + * that maps it. These claimed blocks are easily identified as they will + * have a block number equal to their logical block number within the + * snapshot. 
A copied block can never have this property because they + * must always have been allocated from a BLK_NOCOPY location. + */ +int +ffs_snapblkfree(freeip, bno, size) + struct inode *freeip; + ufs_daddr_t bno; + long size; +{ + struct buf *ibp, *cbp, *savedcbp = 0; + struct fs *fs = freeip->i_fs; + struct proc *p = CURPROC; + struct inode *ip; + struct vnode *vp; + ufs_daddr_t lbn, blkno; + int indiroff = 0, error = 0, claimedblk = 0; + + lbn = fragstoblks(fs, bno); + for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip; + ip = ip->i_copyonwrite) { + vp = ITOV(ip); + /* + * Lookup block being written. + */ + if (lbn < NDADDR) { + blkno = ip->i_db[lbn]; + } else { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + p->p_flag |= P_COWINPROGRESS; + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); + p->p_flag &= ~P_COWINPROGRESS; + VOP_UNLOCK(vp, 0, p); + if (error) + break; + indiroff = (lbn - NDADDR) % NINDIR(fs); + blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; + } + /* + * Check to see if block needs to be copied. + */ + switch (blkno) { + /* + * If the snapshot has already copied the block (default), + * or does not care about the block, it is not needed. + */ + default: + case BLK_NOCOPY: + if (lbn >= NDADDR) + brelse(ibp); + continue; + /* + * No previous snapshot claimed the block, so it will be + * freed and become a BLK_NOCOPY (don't care) for us. + */ + case BLK_SNAP: + if (claimedblk) + panic("snapblkfree: inconsistent block type"); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (lbn < NDADDR) { + ip->i_db[lbn] = BLK_NOCOPY; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + ((ufs_daddr_t *)(ibp->b_data))[indiroff] = + BLK_NOCOPY; + bdwrite(ibp); + } + VOP_UNLOCK(vp, 0, p); + continue; + /* + * A block that we map is being freed. If it has not been + * claimed yet, we will claim or copy it (below). 
+ */ + case 0: + claimedblk = 1; + break; + } + /* + * If this is a full size block, we will just grab it + * and assign it to the snapshot inode. Otherwise we + * will proceed to copy it. See explanation for this + * routine as to why only a single snapshot needs to + * claim this block. + */ + if (size == fs->fs_bsize) { +#ifdef DEBUG + if (snapdebug) + printf("%s %d lbn %d from inum %d\n", + "Grabonremove: snapino", ip->i_number, lbn, + freeip->i_number); +#endif + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (lbn < NDADDR) { + ip->i_db[lbn] = bno; + } else { + ((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno; + bdwrite(ibp); + } + ip->i_blocks += btodb(size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + VOP_UNLOCK(vp, 0, p); + return (1); + } + if (lbn >= NDADDR) + brelse(ibp); + /* + * Allocate the block into which to do the copy. Note that this + * allocation will never require any additional allocations for + * the snapshot inode. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + p->p_flag |= P_COWINPROGRESS; + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, 0, &cbp); + p->p_flag &= ~P_COWINPROGRESS; + VOP_UNLOCK(vp, 0, p); + if (error) + break; +#ifdef DEBUG + if (snapdebug) + printf("%s%d lbn %d for inum %d size %ld to blkno %d\n", + "Copyonremove: snapino ", ip->i_number, lbn, + freeip->i_number, size, cbp->b_blkno); +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. + */ + if (savedcbp != 0) { + bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); + bawrite(cbp); + continue; + } + /* + * Otherwise, read the old block contents into the buffer. + */ + if ((error = readblock(cbp, lbn)) != 0) + break; + savedcbp = cbp; + } + if (savedcbp) + bawrite(savedcbp); + /* + * If we have been unable to allocate a block in which to do + * the copy, then return non-zero so that the fragment will + * not be freed. Although space will be lost, the snapshot + * will stay consistent. 
+ */ + return (error); +} + +/* + * Associate snapshot files when mounting. + */ +void +ffs_snapshot_mount(mp) + struct mount *mp; +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct proc *p = CURPROC; + struct inode *ip, **listtailp; + struct vnode *vp; + int error, snaploc, loc; + + listtailp = &VTOI(ump->um_devvp)->i_copyonwrite; + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + return; + if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){ + printf("ffs_snapshot_mount: vget failed %d\n", error); + continue; + } + ip = VTOI(vp); + if ((ip->i_flags & SF_SNAPSHOT) == 0) { + printf("ffs_snapshot_mount: non-snapshot inode %d\n", + fs->fs_snapinum[snaploc]); + vput(vp); + for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { + if (fs->fs_snapinum[loc] == 0) + break; + fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; + } + fs->fs_snapinum[loc - 1] = 0; + snaploc--; + continue; + } + if (ip->i_copyonwrite != 0) + panic("ffs_snapshot_mount: %d already on list", + ip->i_number); + *listtailp = ip; + listtailp = &ip->i_copyonwrite; + vp->v_flag |= VSYSTEM; + VOP_UNLOCK(vp, 0, p); + ump->um_devvp->v_flag |= VCOPYONWRITE; + } +} + +/* + * Disassociate snapshot files when unmounting. + */ +void +ffs_snapshot_unmount(mp) + struct mount *mp; +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *devip = VTOI(ump->um_devvp); + struct inode *xp; + + while ((xp = devip->i_copyonwrite) != 0) { + devip->i_copyonwrite = xp->i_copyonwrite; + xp->i_copyonwrite = 0; + vrele(ITOV(xp)); + } + ump->um_devvp->v_flag &= ~VCOPYONWRITE; +} + +/* + * Check for need to copy block that is about to be written, + * copying the block if necessary. 
+ */ +int +ffs_copyonwrite(ap) + struct vop_copyonwrite_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap; +{ + struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp; + struct fs *fs = VTOI(bp->b_vp)->i_fs; + struct proc *p = CURPROC; + struct inode *ip; + struct vnode *vp; + ufs_daddr_t lbn, blkno; + int indiroff, error = 0; + + lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); + if (p->p_flag & P_COWINPROGRESS) + panic("ffs_copyonwrite: recursive call"); + for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) { + vp = ITOV(ip); + /* + * We ensure that everything of our own that needs to be + * copied will be done at the time that ffs_snapshot is + * called. Thus we can skip the check here which can + * deadlock in doing the lookup in VOP_BALLOC. + */ + if (bp->b_vp == vp) + continue; + /* + * Check to see if block needs to be copied. + */ + if (lbn < NDADDR) { + blkno = ip->i_db[lbn]; + } else { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + p->p_flag |= P_COWINPROGRESS; + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); + p->p_flag &= ~P_COWINPROGRESS; + VOP_UNLOCK(vp, 0, p); + if (error) + break; + indiroff = (lbn - NDADDR) % NINDIR(fs); + blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; + brelse(ibp); + } +#ifdef DIAGNOSTIC + if (blkno == BLK_SNAP && bp->b_lblkno >= 0) + panic("ffs_copyonwrite: bad copy block"); +#endif + if (blkno != 0) + continue; + /* + * Allocate the block into which to do the copy. Note that this + * allocation will never require any additional allocations for + * the snapshot inode. 
+ */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + p->p_flag |= P_COWINPROGRESS; + error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, 0, &cbp); + p->p_flag &= ~P_COWINPROGRESS; + VOP_UNLOCK(vp, 0, p); +#ifdef DEBUG + if (snapdebug) { + printf("Copyonwrite: snapino %d lbn %d for ", + ip->i_number, lbn); + if (bp->b_vp == ap->a_vp) + printf("fs metadata"); + else + printf("inum %d", VTOI(bp->b_vp)->i_number); + printf(" lblkno %d to blkno %d\n", bp->b_lblkno, + cbp->b_blkno); + } +#endif + if (error) + break; + /* + * If we have already read the old block contents, then + * simply copy them to the new block. + */ + if (savedcbp != 0) { + bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); + bawrite(cbp); + continue; + } + /* + * Otherwise, read the old block contents into the buffer. + */ + if ((error = readblock(cbp, lbn)) != 0) + break; + savedcbp = cbp; + } + if (savedcbp) + bawrite(savedcbp); + return (error); +} + +/* + * Read the specified block into the given buffer. + * Much of this boiler-plate comes from bwrite(). 
+ */ +static int +readblock(bp, lbn) + struct buf *bp; + daddr_t lbn; +{ + struct uio auio; + struct iovec aiov; + struct proc *p = CURPROC; + struct inode *ip = VTOI(bp->b_vp); + + aiov.iov_base = bp->b_data; + aiov.iov_len = bp->b_bcount; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); + auio.uio_resid = bp->b_bcount; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = p; + return (physio(ip->i_devvp->v_rdev, &auio, 0)); +} diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 40e9669..d9e6414 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -508,7 +508,7 @@ softdep_process_worklist(matchmnt) { struct proc *p = CURPROC; struct worklist *wk; - struct fs *matchfs; + struct mount *mp; int matchcnt, loopcount; /* @@ -517,9 +517,6 @@ softdep_process_worklist(matchmnt) */ filesys_syncer = p; matchcnt = 0; - matchfs = NULL; - if (matchmnt != NULL) - matchfs = VFSTOUFS(matchmnt)->um_fs; /* * There is no danger of having multiple processes run this * code. 
It is single threaded solely so that softdep_flushfiles @@ -550,30 +547,42 @@ softdep_process_worklist(matchmnt) case D_DIRREM: /* removal of a directory entry */ - if (WK_DIRREM(wk)->dm_mnt == matchmnt) + mp = WK_DIRREM(wk)->dm_mnt; + if (mp == matchmnt) matchcnt += 1; + vn_start_write(NULL, &mp, V_WAIT); handle_workitem_remove(WK_DIRREM(wk)); + vn_finished_write(mp); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ - if (WK_FREEBLKS(wk)->fb_fs == matchfs) + mp = WK_FREEBLKS(wk)->fb_mnt; + if (mp == matchmnt) matchcnt += 1; + vn_start_write(NULL, &mp, V_WAIT); handle_workitem_freeblocks(WK_FREEBLKS(wk)); + vn_finished_write(mp); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ - if (WK_FREEFRAG(wk)->ff_fs == matchfs) + mp = WK_FREEFRAG(wk)->ff_mnt; + if (mp == matchmnt) matchcnt += 1; + vn_start_write(NULL, &mp, V_WAIT); handle_workitem_freefrag(WK_FREEFRAG(wk)); + vn_finished_write(mp); break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ - if (WK_FREEFILE(wk)->fx_fs == matchfs) + mp = WK_FREEFILE(wk)->fx_mnt; + if (mp == matchmnt) matchcnt += 1; + vn_start_write(NULL, &mp, V_WAIT); handle_workitem_freefile(WK_FREEFILE(wk)); + vn_finished_write(mp); break; default: @@ -1316,7 +1325,7 @@ newfreefrag(ip, blkno, size) freefrag->ff_list.wk_type = D_FREEFRAG; freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ freefrag->ff_inum = ip->i_number; - freefrag->ff_fs = fs; + freefrag->ff_mnt = ITOV(ip)->v_mount; freefrag->ff_devvp = ip->i_devvp; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; @@ -1333,7 +1342,8 @@ handle_workitem_freefrag(freefrag) { struct inode tip; - tip.i_fs = freefrag->ff_fs; + tip.i_vnode = NULL; + tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs; tip.i_devvp = freefrag->ff_devvp; tip.i_dev = freefrag->ff_devvp->v_rdev; tip.i_number = freefrag->ff_inum; @@ -1601,7 +1611,7 @@ softdep_setup_freeblocks(ip, length) freeblks->fb_uid = ip->i_uid; 
freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; - freeblks->fb_fs = fs; + freeblks->fb_mnt = ITOV(ip)->v_mount; freeblks->fb_oldsize = ip->i_size; freeblks->fb_newsize = length; freeblks->fb_chkcnt = ip->i_blocks; @@ -1845,7 +1855,7 @@ softdep_freefile(pvp, ino, mode) freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; - freefile->fx_fs = ip->i_fs; + freefile->fx_mnt = ITOV(ip)->v_mount; /* * If the inodedep does not exist, then the zero'ed inode has @@ -1949,13 +1959,13 @@ handle_workitem_freeblocks(freeblks) int error, allerror = 0; ufs_lbn_t baselbns[NIADDR], tmpval; + tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs; tip.i_number = freeblks->fb_previousinum; tip.i_devvp = freeblks->fb_devvp; tip.i_dev = freeblks->fb_devvp->v_rdev; - tip.i_fs = freeblks->fb_fs; tip.i_size = freeblks->fb_oldsize; tip.i_uid = freeblks->fb_uid; - fs = freeblks->fb_fs; + tip.i_vnode = NULL; tmpval = 1; baselbns[0] = NDADDR; for (i = 1; i < NIADDR; i++) { @@ -2715,20 +2725,23 @@ static void handle_workitem_freefile(freefile) struct freefile *freefile; { + struct fs *fs; struct vnode vp; struct inode tip; struct inodedep *idp; int error; + fs = VFSTOUFS(freefile->fx_mnt)->um_fs; #ifdef DEBUG ACQUIRE_LOCK(&lk); - if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) + if (inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp)) panic("handle_workitem_freefile: inodedep survived"); FREE_LOCK(&lk); #endif tip.i_devvp = freefile->fx_devvp; tip.i_dev = freefile->fx_devvp->v_rdev; - tip.i_fs = freefile->fx_fs; + tip.i_fs = fs; + tip.i_vnode = &vp; vp.v_data = &tip; if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0) softdep_error("handle_workitem_freefile", error); @@ -4419,14 +4432,18 @@ clear_remove(p) mp = pagedep->pd_mnt; ino = pagedep->pd_ino; FREE_LOCK(&lk); + if (vn_start_write(NULL, &mp, V_WAIT | PCATCH) != 0) + return; if ((error = VFS_VGET(mp, ino, &vp)) != 0) { 
softdep_error("clear_remove: vget", error); + vn_finished_write(mp); return; } if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) softdep_error("clear_remove: fsync", error); drain_output(vp, 0); vput(vp); + vn_finished_write(mp); return; } } @@ -4486,8 +4503,11 @@ clear_inodedeps(p) if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) continue; FREE_LOCK(&lk); + if (vn_start_write(NULL, &mp, V_WAIT | PCATCH) != 0) + return; if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_inodedeps: vget", error); + vn_finished_write(mp); return; } if (ino == lastino) { @@ -4499,6 +4519,7 @@ clear_inodedeps(p) drain_output(vp, 0); } vput(vp); + vn_finished_write(mp); ACQUIRE_LOCK(&lk); } FREE_LOCK(&lk); diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 89ff6d3..5280181 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -133,7 +133,7 @@ VFS_SET(ufs_vfsops, ufs, 0); * namei() if it is a genuine NULL from the user. */ static int -ffs_mount( mp, path, data, ndp, p) +ffs_mount(mp, path, data, ndp, p) struct mount *mp; /* mount struct pointer*/ char *path; /* path to mount point*/ caddr_t data; /* arguments to FS specific mount*/ @@ -141,49 +141,34 @@ ffs_mount( mp, path, data, ndp, p) struct proc *p; /* process requesting mount*/ { size_t size; - int err = 0; struct vnode *devvp; - struct ufs_args args; struct ufsmount *ump = 0; register struct fs *fs; - int error, flags, ronly = 0; + int error, flags; mode_t accessmode; /* - * Use NULL path to flag a root mount + * Use NULL path to indicate we are mounting the root file system. 
*/ - if( path == NULL) { - /* - *** - * Mounting root file system - *** - */ - - if ((err = bdevvp(rootdev, &rootvp))) { + if (path == NULL) { + if ((error = bdevvp(rootdev, &rootvp))) { printf("ffs_mountroot: can't find rootvp\n"); - return (err); - } - - if( ( err = ffs_mountfs(rootvp, mp, p, M_FFSNODE)) != 0) { - /* fs specific cleanup (if any)*/ - goto error_1; + return (error); } - goto dostatfs; /* success*/ + if ((error = ffs_mountfs(rootvp, mp, p, M_FFSNODE)) != 0) + return (error); + (void)VFS_STATFS(mp, &mp->mnt_stat, p); + return (0); } /* - *** * Mounting non-root file system or updating a file system - *** */ - - /* copy in user arguments*/ - err = copyin(data, (caddr_t)&args, sizeof (struct ufs_args)); - if (err) - goto error_1; /* can't get arguments*/ + if ((error = copyin(data, (caddr_t)&args, sizeof(struct ufs_args)))!= 0) + return (error); /* * If updating, check whether changing from read-only to @@ -193,25 +178,36 @@ ffs_mount( mp, path, data, ndp, p) ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; - err = 0; - ronly = fs->fs_ronly; /* MNT_RELOAD might change this */ - if (ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) + return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { - err = softdep_flushfiles(mp, flags, p); + error = softdep_flushfiles(mp, flags, p); } else { - err = ffs_flushfiles(mp, flags, p); + error = ffs_flushfiles(mp, flags, p); } - ronly = 1; - } - if (!err && (mp->mnt_flag & MNT_RELOAD)) - err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); - if (err) { - goto error_1; + if (error) { + vn_finished_write(mp); + return (error); + } + fs->fs_ronly = 1; + if ((fs->fs_flags & FS_UNCLEAN) == 0) + fs->fs_clean = 1; + if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { + fs->fs_ronly = 0; + fs->fs_clean = 0; + vn_finished_write(mp); + return (error); + } + 
vn_finished_write(mp); } - if (ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { + if ((mp->mnt_flag & MNT_RELOAD) && + (error = ffs_reload(mp, ndp->ni_cnd.cn_cred, p)) != 0) + return (error); + if (fs->fs_ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. @@ -225,31 +221,36 @@ ffs_mount( mp, path, data, ndp, p) } VOP_UNLOCK(devvp, 0, p); } - fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (mp->mnt_flag & MNT_FORCE) { - printf( -"WARNING: %s was not properly dismounted\n", - fs->fs_fsmnt); + printf("WARNING: %s was not %s\n", + fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); - err = EPERM; - goto error_1; + return (EPERM); } } - + if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) + return (error); + fs->fs_ronly = 0; + fs->fs_clean = 0; + if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { + vn_finished_write(mp); + return (error); + } /* check to see if we need to start softdep */ - if (fs->fs_flags & FS_DOSOFTDEP) { - err = softdep_mount(devvp, mp, fs, p->p_ucred); - if (err) - goto error_1; + if ((fs->fs_flags & FS_DOSOFTDEP) && + (error = softdep_mount(devvp, mp, fs, p->p_ucred))){ + vn_finished_write(mp); + return (error); } - - ronly = 0; + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + vn_finished_write(mp); } /* * Soft updates is incompatible with "async", @@ -258,18 +259,18 @@ ffs_mount( mp, path, data, ndp, p) * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ - if (mp->mnt_flag & MNT_SOFTDEP) { + if (mp->mnt_flag & MNT_SOFTDEP) mp->mnt_flag &= ~MNT_ASYNC; - } - /* if not updating name...*/ - if (args.fspec == 0) { - /* - * Process export requests. Jumping to "success" - * will return the vfs_export() error code. 
- */ - err = vfs_export(mp, &ump->um_export, &args.export); - goto success; - } + /* + * If not updating name, process export requests. + */ + if (args.fspec == 0) + return (vfs_export(mp, &ump->um_export, &args.export)); + /* + * If this is a snapshot request, take the snapshot. + */ + if (mp->mnt_flag & MNT_SNAPSHOT) + return (ffs_snapshot(mp, args.fspec)); } /* @@ -277,17 +278,14 @@ ffs_mount( mp, path, data, ndp, p) * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); - err = namei(ndp); - if (err) { - /* can't get devvp!*/ - goto error_1; - } - + if ((error = namei(ndp)) != 0) + return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; - - if (!vn_isdisk(devvp, &err)) - goto error_2; + if (!vn_isdisk(devvp, &error)) { + vrele(devvp); + return (error); + } /* * If mount by non-root, then verify that user has necessary @@ -298,7 +296,7 @@ ffs_mount( mp, path, data, ndp, p) if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); - if ((error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p)) != 0) { + if ((error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p))!= 0){ vput(devvp); return (error); } @@ -307,96 +305,43 @@ ffs_mount( mp, path, data, ndp, p) if (mp->mnt_flag & MNT_UPDATE) { /* - ******************** - * UPDATE + * Update only + * * If it's not the same vnode, or at least the same device * then it's not correct. 
- ******************** */ - if (devvp != ump->um_devvp) { - if ( devvp->v_rdev == ump->um_devvp->v_rdev) { - vrele(devvp); - } else { - err = EINVAL; /* needs translation */ - } - } else - vrele(devvp); - /* - * Update device name only on success - */ - if( !err) { - /* Save "mounted from" info for mount point (NULL pad)*/ - copyinstr( args.fspec, - mp->mnt_stat.f_mntfromname, - MNAMELEN - 1, - &size); - bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); - } + if (devvp != ump->um_devvp && + devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; /* needs translation */ + vrele(devvp); + if (error) + return (error); } else { /* - ******************** - * NEW MOUNT - ******************** + * New mount + * + * We need the name for the mount point (also used for + * "last mounted on") copied in. If an error occurs, + * the mount point is discarded by the upper level code. */ - - /* - * Since this is a new mount, we want the names for - * the device and the mount point copied in. If an - * error occurs, the mountpoint is discarded by the - * upper level code. 
- */ - /* Save "last mounted on" info for mount point (NULL pad)*/ - copyinstr( path, /* mount point*/ - mp->mnt_stat.f_mntonname, /* save area*/ - MNAMELEN - 1, /* max size*/ - &size); /* real size*/ + copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size); - - /* Save "mounted from" info for mount point (NULL pad)*/ - copyinstr( args.fspec, /* device name*/ - mp->mnt_stat.f_mntfromname, /* save area*/ - MNAMELEN - 1, /* max size*/ - &size); /* real size*/ - bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); - - err = ffs_mountfs(devvp, mp, p, M_FFSNODE); - } - if (err) { - goto error_2; + if ((error = ffs_mountfs(devvp, mp, p, M_FFSNODE)) != 0) { + vrele(devvp); + return (error); + } } - -dostatfs: /* - * Initialize FS stat information in mount struct; uses both - * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname - * - * This code is common to root and non-root mounts + * Save "mounted from" device name info for mount point (NULL pad). + */ + copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); + bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + /* + * Initialize filesystem stat information in mount struct. */ (void)VFS_STATFS(mp, &mp->mnt_stat, p); - - goto success; - - -error_2: /* error with devvp held*/ - - /* release devvp before failing*/ - vrele(devvp); - -error_1: /* no state to back out*/ - -success: - if (!err && path && (mp->mnt_flag & MNT_UPDATE)) { - /* Update clean flag after changing read-onlyness. */ - fs = ump->um_fs; - if (ronly != fs->fs_ronly) { - fs->fs_ronly = ronly; - fs->fs_clean = ronly && - (fs->fs_flags & FS_UNCLEAN) == 0 ? 
1 : 0; - ffs_sbupdate(ump, MNT_WAIT); - } - } - return (err); + return (0); } /* @@ -478,7 +423,7 @@ ffs_reload(mp, cred, p) newfs->fs_maxcluster = fs->fs_maxcluster; bcopy(newfs, fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) - bp->b_flags |= B_INVAL; + bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat(fs); @@ -670,7 +615,7 @@ ffs_mountfs(devvp, mp, p, malloctype) ump->um_vfree = ffs_vfree; bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) - bp->b_flags |= B_INVAL; + bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; @@ -750,6 +695,8 @@ ffs_mountfs(devvp, mp, p, malloctype) free(base, M_UFSMNT); goto out; } + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT); @@ -886,6 +833,15 @@ ffs_flushfiles(mp, flags, p) */ } #endif + if (ump->um_devvp->v_flag & VCOPYONWRITE) { + if ((error = vflush(mp, NULL, SKIPSYSTEM | flags)) != 0) + return (error); + ffs_snapshot_unmount(mp); + /* + * Here we fall through to vflush again to ensure + * that we have gotten rid of all the system vnodes. + */ + } /* * Flush all the files. 
*/ diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 539f302..eb6d621 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -95,6 +95,7 @@ vop_t **ffs_specop_p; static struct vnodeopv_entry_desc ffs_specop_entries[] = { { &vop_default_desc, (vop_t *) ufs_vnoperatespec }, { &vop_fsync_desc, (vop_t *) ffs_fsync }, + { &vop_copyonwrite_desc, (vop_t *) ffs_copyonwrite }, { NULL, NULL } }; static struct vnodeopv_desc ffs_specop_opv_desc = @@ -129,11 +130,20 @@ ffs_fsync(ap) } */ *ap; { struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); struct buf *bp; struct buf *nbp; int s, error, wait, passes, skipmeta; daddr_t lbn; + /* + * Snapshots have to be unlocked so they do not deadlock + * checking whether they need to copy their written buffers. + * We always hold a reference, so they cannot be removed + * out from underneath us. + */ + if (ip->i_flags & SF_SNAPSHOT) + VOP_UNLOCK(vp, 0, ap->a_p); wait = (ap->a_waitfor == MNT_WAIT); if (vn_isdisk(vp, NULL)) { lbn = INT_MAX; @@ -141,8 +151,6 @@ ffs_fsync(ap) (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) softdep_fsync_mountdev(vp); } else { - struct inode *ip; - ip = VTOI(vp); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); } @@ -279,5 +287,7 @@ loop: } splx(s); error = UFS_UPDATE(vp, wait); + if (ip->i_flags & SF_SNAPSHOT) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); return (error); } diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index 1908a3e..cf9cac8 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -382,7 +382,7 @@ struct freefrag { struct worklist ff_list; /* id_inowait or delayed worklist */ # define ff_state ff_list.wk_state /* owning user; should be uid_t */ struct vnode *ff_devvp; /* filesystem device vnode */ - struct fs *ff_fs; /* addr of superblock */ + struct mount *ff_mnt; /* associated mount point */ ufs_daddr_t ff_blkno; /* fragment physical block number */ long ff_fragsize; /* size of fragment being deleted */ ino_t 
ff_inum; /* owning inode number */ @@ -398,7 +398,7 @@ struct freeblks { struct worklist fb_list; /* id_inowait or delayed worklist */ ino_t fb_previousinum; /* inode of previous owner of blocks */ struct vnode *fb_devvp; /* filesystem device vnode */ - struct fs *fb_fs; /* addr of superblock */ + struct mount *fb_mnt; /* associated mount point */ off_t fb_oldsize; /* previous file size */ off_t fb_newsize; /* new file size */ int fb_chkcnt; /* used to check cnt of blks released */ @@ -418,7 +418,7 @@ struct freefile { mode_t fx_mode; /* mode of inode */ ino_t fx_oldinum; /* inum of the unlinked file */ struct vnode *fx_devvp; /* filesystem device vnode */ - struct fs *fx_fs; /* addr of superblock */ + struct mount *fx_mnt; /* associated mount point */ }; /* diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 83960b0..6417a10 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -84,6 +84,7 @@ struct inode { struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ u_quad_t i_modrev; /* Revision level for NFS lease. */ struct lockf *i_lockf;/* Head of byte-level lock list. */ + struct inode *i_copyonwrite; /* copy-on-write list */ /* * Side effects; used during directory lookup. 
*/ diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c index 9056340..ab4ac52 100644 --- a/sys/ufs/ufs/ufs_bmap.c +++ b/sys/ufs/ufs/ufs_bmap.c @@ -47,6 +47,7 @@ #include <sys/vnode.h> #include <sys/mount.h> #include <sys/resourcevar.h> +#include <sys/stat.h> #include <ufs/ufs/extattr.h> #include <ufs/ufs/quota.h> @@ -115,7 +116,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct indir a[NIADDR+1], *xap; ufs_daddr_t daddr; long metalbn; - int error, maxrun, num; + int error, num, maxrun = 0; ip = VTOI(vp); mp = vp->v_mount; @@ -127,6 +128,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) #endif if (runp) { + maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; *runp = 0; } @@ -134,7 +136,6 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) *runb = 0; } - maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; xap = ap == NULL ? a : ap; if (!nump) @@ -146,9 +147,12 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); - if (*bnp == 0) - *bnp = -1; - else if (runp) { + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); @@ -226,8 +230,13 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) if (bp) bqrelse(bp); - daddr = blkptrtodb(ump, daddr); - *bnp = daddr == 0 ? 
-1 : daddr; + *bnp = blkptrtodb(ump, daddr); + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } return (0); } diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index d576be9..b740792 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -87,6 +87,7 @@ int ufs_init __P((struct vfsconf *)); void ufs_itimes __P((struct vnode *vp)); int ufs_lookup __P((struct vop_cachedlookup_args *)); int ufs_reclaim __P((struct vop_reclaim_args *)); +void ffs_snapremove __P((struct vnode *vp)); int ufs_root __P((struct mount *, struct vnode **)); int ufs_start __P((struct mount *, int, struct proc *)); int ufs_vinit __P((struct mount *, vop_t **, vop_t **, struct vnode **)); diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index 507e716..485a6d2 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -77,6 +77,7 @@ ufs_inactive(ap) if (ip->i_mode == 0) goto out; if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + (void) vn_write_suspend_wait(vp, V_WAIT); #ifdef QUOTA if (!getinoquota(ip)) (void)chkiq(ip, -1, NOCRED, 0); @@ -91,8 +92,15 @@ ufs_inactive(ap) ip->i_flag |= IN_CHANGE | IN_UPDATE; UFS_VFREE(vp, ip->i_number, mode); } - if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) - UFS_UPDATE(vp, 0); + if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { + if ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && + vn_write_suspend_wait(vp, V_NOWAIT)) { + ip->i_flag &= ~IN_ACCESS; + } else { + (void) vn_write_suspend_wait(vp, V_WAIT); + UFS_UPDATE(vp, 0); + } + } out: VOP_UNLOCK(vp, 0, p); /* diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c index 574a330..6396f67 100644 --- a/sys/ufs/ufs/ufs_quota.c +++ b/sys/ufs/ufs/ufs_quota.c @@ -889,6 +889,7 @@ dqsync(vp, dq) struct vnode *dqvp; struct iovec aiov; struct uio auio; + struct mount *mp; int error; if (dq == NODQUOT) @@ -897,6 
+898,7 @@ dqsync(vp, dq) return (0); if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) panic("dqsync: file"); + (void) vn_write_suspend_wait(dqvp, V_WAIT); if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, p); while (dq->dq_flags & DQ_LOCK) { diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index e3b6e29..d97568c 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -702,6 +702,8 @@ ufs_remove(ap) int error; ip = VTOI(vp); + if ((ip->i_flags & SF_SNAPSHOT) != 0) + ffs_snapremove(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; @@ -2215,6 +2217,7 @@ static struct vnodeopv_entry_desc ufs_vnodeop_entries[] = { { &vop_open_desc, (vop_t *) ufs_open }, { &vop_pathconf_desc, (vop_t *) ufs_pathconf }, { &vop_poll_desc, (vop_t *) vop_stdpoll }, + { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_readdir_desc, (vop_t *) ufs_readdir }, { &vop_readlink_desc, (vop_t *) ufs_readlink }, diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 307dd0b..97b221e 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -777,6 +777,7 @@ rescan0: int written; int swap_pageouts_ok; struct vnode *vp = NULL; + struct mount *mp; object = m->object; @@ -853,9 +854,13 @@ rescan0: if (object->type == OBJT_VNODE) { vp = object->handle; + mp = NULL; + if (vp->v_type == VREG) + vn_start_write(vp, &mp, V_NOWAIT); if (VOP_ISLOCKED(vp, NULL) || vp->v_data == NULL || vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { + vn_finished_write(mp); if ((m->queue == PQ_INACTIVE) && (m->hold_count == 0) && (m->busy == 0) && @@ -878,6 +883,7 @@ rescan0: if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); + vn_finished_write(mp); continue; } @@ -888,6 +894,7 @@ rescan0: */ if (m->busy || (m->flags & PG_BUSY)) { vput(vp); + vn_finished_write(mp); continue; } @@ -902,6 +909,7 @@ rescan0: if (object->flags & OBJ_MIGHTBEDIRTY) 
vnodes_skipped++; vput(vp); + vn_finished_write(mp); continue; } } @@ -913,8 +921,10 @@ rescan0: * start the cleaning operation. */ written = vm_pageout_clean(m); - if (vp) + if (vp) { vput(vp); + vn_finished_write(mp); + } maxlaunder -= written; } diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 2633426..3dd12ec 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -850,6 +850,7 @@ vnode_pager_putpages(object, m, count, sync, rtvals) { int rtval; struct vnode *vp; + struct mount *mp; int bytes = count * PAGE_SIZE; /* @@ -872,11 +873,15 @@ vnode_pager_putpages(object, m, count, sync, rtvals) */ vp = object->handle; + if (vp->v_type != VREG) + mp = NULL; + (void)vn_start_write(vp, &mp, V_WAIT); rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0); if (rtval == EOPNOTSUPP) { printf("vnode_pager: *** WARNING *** stale FS putpages\n"); rtval = vnode_pager_generic_putpages( vp, m, bytes, sync, rtvals); } + vn_finished_write(mp); } |