From 5bb0933cf1cda47a3d3a201d7c18c19beee0c55d Mon Sep 17 00:00:00 2001
From: trasz
Date: Sun, 18 Nov 2012 18:57:19 +0000
Subject: Add UFS writesuspension mechanism, designed to allow userland
 processes to modify on-disk metadata for filesystems mounted for write.

Reviewed by: kib, mckusick
Sponsored by: FreeBSD Foundation
---
 sys/ufs/ffs/ffs_extern.h  |   4 +
 sys/ufs/ffs/ffs_suspend.c | 409 ++++++++++++++++++++++++++++++++++++++++++++++
 sys/ufs/ffs/ffs_vfsops.c  |  43 ++++--
 sys/ufs/ffs/fs.h          |   8 ++
 sys/ufs/ufs/ufsmount.h    |   1 +
 5 files changed, 455 insertions(+), 10 deletions(-)
 create mode 100644 sys/ufs/ffs/ffs_suspend.c

(limited to 'sys/ufs')

diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 4db351d..918383d 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -79,9 +79,11 @@ int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
 void	ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
 int	ffs_mountroot(void);
 void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
+int	ffs_own_mount(const struct mount *mp);
 int	ffs_reallocblks(struct vop_reallocblks_args *);
 int	ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
 	    ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
+int	ffs_reload(struct mount *, struct thread *, int);
 int	ffs_sbupdate(struct ufsmount *, int, int);
 void	ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
 int	ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t,
@@ -100,6 +102,8 @@ int ffs_valloc(struct vnode *, int, struct ucred *, struct vnode **);
 int	ffs_vfree(struct vnode *, ino_t, int);
 vfs_vget_t ffs_vget;
 int	ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
+void	ffs_susp_initialize(void);
+void	ffs_susp_uninitialize(void);
 
 #define	FFSV_FORCEINSMQ	0x0001
 
diff --git a/sys/ufs/ffs/ffs_suspend.c b/sys/ufs/ffs/ffs_suspend.c
new file mode 100644
index 0000000..9d8e2c1
--- /dev/null
+++ b/sys/ufs/ffs/ffs_suspend.c
@@ -0,0 +1,409 @@
+/*-
+ * Copyright (c) 2012 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+static d_open_t ffs_susp_open;
+static d_write_t ffs_susp_rdwr;
+static d_ioctl_t ffs_susp_ioctl;
+
+static struct cdevsw ffs_susp_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_open =	ffs_susp_open,
+	.d_read =	ffs_susp_rdwr,
+	.d_write =	ffs_susp_rdwr,
+	.d_ioctl =	ffs_susp_ioctl,
+	.d_name =	"ffs_susp",
+};
+
+static struct cdev *ffs_susp_dev;
+/*
+ * ffs_susp_lock serializes suspend/resume (exclusive) against I/O through
+ * the device (shared); it also protects um_writesuspended.
+ */
+static struct sx ffs_susp_lock;
+
+/*
+ * Return 1 if the filesystem was write-suspended through this device,
+ * 0 otherwise.  The caller must hold ffs_susp_lock.
+ */
+static int
+ffs_susp_suspended(struct mount *mp)
+{
+	struct ufsmount *ump;
+
+	sx_assert(&ffs_susp_lock, SA_LOCKED);
+
+	ump = VFSTOUFS(mp);
+	if (ump->um_writesuspended)
+		return (1);
+	return (0);
+}
+
+/*
+ * Opening the device has no side effects; the suspension is requested
+ * explicitly with the UFSSUSPEND ioctl.
+ */
+static int
+ffs_susp_open(struct cdev *dev __unused, int flags __unused,
+    int fmt __unused, struct thread *td __unused)
+{
+
+	return (0);
+}
+
+/*
+ * Read or write the device underlying the filesystem suspended through
+ * this file descriptor.  Transfers go through the buffer cache, at most one
+ * filesystem block at a time; the offset and every chunk length must be
+ * multiples of the fragment size, otherwise EINVAL is returned.  Fails with
+ * ENXIO if the descriptor has no suspended filesystem attached.  If some
+ * data was transferred before an error occurred, the short transfer is
+ * reported as success.
+ */
+static int
+ffs_susp_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	int error, i;
+	struct vnode *devvp;
+	struct mount *mp;
+	struct ufsmount *ump;
+	struct buf *bp;
+	void *base;
+	size_t len;
+	ssize_t cnt;
+	struct fs *fs;
+
+	sx_slock(&ffs_susp_lock);
+
+	error = devfs_get_cdevpriv((void **)&mp);
+	if (error != 0) {
+		sx_sunlock(&ffs_susp_lock);
+		return (ENXIO);
+	}
+
+	ump = VFSTOUFS(mp);
+	devvp = ump->um_devvp;
+	fs = ump->um_fs;
+
+	if (ffs_susp_suspended(mp) == 0) {
+		sx_sunlock(&ffs_susp_lock);
+		return (ENXIO);
+	}
+
+	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+	    ("neither UIO_READ or UIO_WRITE"));
+	KASSERT(uio->uio_segflg == UIO_USERSPACE,
+	    ("uio->uio_segflg != UIO_USERSPACE"));
+
+	cnt = uio->uio_resid;
+
+	for (i = 0; i < uio->uio_iovcnt; i++) {
+		while (uio->uio_iov[i].iov_len) {
+			base = uio->uio_iov[i].iov_base;
+			len = uio->uio_iov[i].iov_len;
+			if (len > fs->fs_bsize)
+				len = fs->fs_bsize;
+			if (fragoff(fs, uio->uio_offset) != 0 ||
+			    fragoff(fs, len) != 0) {
+				error = EINVAL;
+				goto out;
+			}
+			error = bread(devvp, btodb(uio->uio_offset), len,
+			    NOCRED, &bp);
+			if (error != 0)
+				goto out;
+			if (uio->uio_rw == UIO_WRITE) {
+				error = copyin(base, bp->b_data, len);
+				if (error != 0) {
+					bp->b_flags |= B_INVAL | B_NOCACHE;
+					brelse(bp);
+					goto out;
+				}
+				error = bwrite(bp);
+				if (error != 0)
+					goto out;
+			} else {
+				error = copyout(bp->b_data, base, len);
+				brelse(bp);
+				if (error != 0)
+					goto out;
+			}
+			uio->uio_iov[i].iov_base =
+			    (char *)uio->uio_iov[i].iov_base + len;
+			uio->uio_iov[i].iov_len -= len;
+			uio->uio_resid -= len;
+			uio->uio_offset += len;
+		}
+	}
+
+out:
+	sx_sunlock(&ffs_susp_lock);
+
+	if (uio->uio_resid < cnt)
+		return (0);
+
+	return (error);
+}
+
+/*
+ * Suspend writes on the given mount point.  Returns EINVAL if it is not an
+ * FFS mount, EBUSY if it is already suspended through this device, or the
+ * error from the permission checks or vfs_write_suspend().  The caller must
+ * hold ffs_susp_lock exclusively.
+ */
+static int
+ffs_susp_suspend(struct mount *mp)
+{
+	struct ufsmount *ump;
+	int error;
+
+	sx_assert(&ffs_susp_lock, SA_XLOCKED);
+
+	if (!ffs_own_mount(mp))
+		return (EINVAL);
+	if (ffs_susp_suspended(mp))
+		return (EBUSY);
+
+	ump = VFSTOUFS(mp);
+
+	/*
+	 * Make sure the calling thread is permitted to access the mounted
+	 * device.  The permissions can change after we unlock the vnode;
+	 * it's harmless.
+	 */
+	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+	error = VOP_ACCESS(ump->um_devvp, VREAD | VWRITE,
+	    curthread->td_ucred, curthread);
+	VOP_UNLOCK(ump->um_devvp, 0);
+	if (error != 0)
+		return (error);
+#ifdef MAC
+	if (mac_mount_check_stat(curthread->td_ucred, mp) != 0)
+		return (EPERM);
+#endif
+
+	if ((error = vfs_write_suspend(mp)) != 0)
+		return (error);
+
+	ump->um_writesuspended = 1;
+
+	return (0);
+}
+
+/*
+ * Resume writes on a mount point suspended by ffs_susp_suspend() and drop
+ * the busy reference taken by the UFSSUSPEND ioctl.  The caller must hold
+ * ffs_susp_lock exclusively.
+ */
+static void
+ffs_susp_unsuspend(struct mount *mp)
+{
+	struct ufsmount *ump;
+
+	sx_assert(&ffs_susp_lock, SA_XLOCKED);
+
+	/*
+	 * XXX: The status is kept per-process; the vfs_write_resume() routine
+	 *	asserts that the resuming thread is the same one that called
+	 *	vfs_write_suspend().  The cdevpriv data, however, is attached
+	 *	to the file descriptor, e.g. is inherited during fork.  Thus,
+	 *	it's possible that the resuming process will be different from
+	 *	the one that started the suspension.
+	 *
+	 *	Work around by fooling the check in vfs_write_resume().
+	 */
+	mp->mnt_susp_owner = curthread;
+
+	vfs_write_resume(mp);
+
+	/*
+	 * Clear the flag before dropping the busy reference; once the mount
+	 * point is no longer busy it may be unmounted under us.
+	 */
+	ump = VFSTOUFS(mp);
+	ump->um_writesuspended = 0;
+	vfs_unbusy(mp);
+}
+
+/*
+ * Destructor for the cdevpriv data attached by the UFSSUSPEND ioctl; runs on
+ * UFSRESUME and when the file descriptor is finally closed.  Reloads the
+ * in-core filesystem state, which may be stale after userland modified the
+ * on-disk metadata, and resumes writes.
+ */
+static void
+ffs_susp_dtor(void *data)
+{
+	struct fs *fs;
+	struct ufsmount *ump;
+	struct mount *mp;
+	int error;
+
+	sx_xlock(&ffs_susp_lock);
+
+	mp = (struct mount *)data;
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
+
+	if (ffs_susp_suspended(mp) == 0) {
+		sx_xunlock(&ffs_susp_lock);
+		return;
+	}
+
+	KASSERT((mp->mnt_kern_flag & MNTK_SUSPEND) != 0,
+	    ("MNTK_SUSPEND not set"));
+
+	error = ffs_reload(mp, curthread, 1);
+	if (error != 0)
+		panic("failed to unsuspend writes on %s", fs->fs_fsmnt);
+
+	ffs_susp_unsuspend(mp);
+
+	sx_xunlock(&ffs_susp_lock);
+}
+
+/*
+ * Handle UFSSUSPEND, which suspends writes on the filesystem identified by
+ * the fsid_t argument and ties the suspension to the calling file
+ * descriptor, and UFSRESUME, which ends it.  Not available inside jails.
+ */
+static int
+ffs_susp_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
+    struct thread *td)
+{
+	struct mount *mp;
+	fsid_t *fsidp;
+	int error;
+
+	/*
+	 * No suspend inside the jail.  Allowing it would require making
+	 * sure that e.g. the devfs ruleset for that jail permits access
+	 * to the devvp.
+	 */
+	if (jailed(td->td_ucred))
+		return (EPERM);
+
+	sx_xlock(&ffs_susp_lock);
+
+	switch (cmd) {
+	case UFSSUSPEND:
+		fsidp = (fsid_t *)addr;
+		mp = vfs_getvfs(fsidp);
+		if (mp == NULL) {
+			error = ENOENT;
+			break;
+		}
+		error = vfs_busy(mp, 0);
+		vfs_rel(mp);
+		if (error != 0)
+			break;
+		error = ffs_susp_suspend(mp);
+		if (error != 0) {
+			vfs_unbusy(mp);
+			break;
+		}
+		error = devfs_set_cdevpriv(mp, ffs_susp_dtor);
+		if (error != 0) {
+			/*
+			 * Without the cdevpriv data there is no destructor
+			 * to resume the filesystem later, e.g. when this
+			 * descriptor already holds another suspension;
+			 * undo the suspension instead of leaking it.
+			 */
+			ffs_susp_unsuspend(mp);
+		}
+		break;
+	case UFSRESUME:
+		error = devfs_get_cdevpriv((void **)&mp);
+		if (error != 0)
+			break;
+		/*
+		 * This calls ffs_susp_dtor, which in turn unsuspends the fs.
+		 * The dtor expects to be called without lock held, because
+		 * sometimes it's called from here, and sometimes due to the
+		 * file being closed or process exiting.
+		 */
+		sx_xunlock(&ffs_susp_lock);
+		devfs_clear_cdevpriv();
+		return (0);
+	default:
+		error = ENXIO;
+		break;
+	}
+
+	sx_xunlock(&ffs_susp_lock);
+
+	return (error);
+}
+
+/*
+ * Set up the lock and create the "ufssuspend" device node.
+ */
+void
+ffs_susp_initialize(void)
+{
+
+	sx_init(&ffs_susp_lock, "ffs_susp");
+	ffs_susp_dev = make_dev(&ffs_susp_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+	    "ufssuspend");
+}
+
+/*
+ * Tear down what ffs_susp_initialize() created.
+ */
+void
+ffs_susp_uninitialize(void)
+{
+
+	destroy_dev(ffs_susp_dev);
+	sx_destroy(&ffs_susp_lock);
+}
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index cc54ece..0a43d80 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -75,7 +76,6 @@ __FBSDID("$FreeBSD$");
 
 static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
 
-static int	ffs_reload(struct mount *, struct thread *);
 static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
 static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
 		    ufs2_daddr_t);
@@ -333,7 +333,7 @@ ffs_mount(struct mount *mp)
 			vfs_write_resume(mp);
 		}
 		if ((mp->mnt_flag & MNT_RELOAD) &&
-		    (error = ffs_reload(mp, td)) != 0)
+		    (error = ffs_reload(mp, td, 0)) != 0)
 			return (error);
 		if (fs->fs_ronly &&
 		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
@@ -595,8 +595,8 @@ ffs_cmount(struct mntarg *ma, void *data, uint64_t flags)
 
 /*
  * Reload all incore data for a filesystem (used after running fsck on
- * the root filesystem and finding things to fix). The filesystem must
- * be mounted read-only.
+ * the root filesystem and finding things to fix). If the 'force' flag
+ * is 0, the filesystem must be mounted read-only.
  *
  * Things to do to update the mount:
  *	1) invalidate all cached meta-data.
@@ -606,8 +606,8 @@ ffs_cmount(struct mntarg *ma, void *data, uint64_t flags)
  *	5) invalidate all cached file data.
  *	6) re-read inode data for all active vnodes.
  */
-static int
-ffs_reload(struct mount *mp, struct thread *td)
+int
+ffs_reload(struct mount *mp, struct thread *td, int force)
 {
 	struct vnode *vp, *mvp, *devvp;
 	struct inode *ip;
@@ -619,9 +619,15 @@ ffs_reload(struct mount *mp, struct thread *td)
 	int i, blks, size, error;
 	int32_t *lp;
 
-	if ((mp->mnt_flag & MNT_RDONLY) == 0)
-		return (EINVAL);
 	ump = VFSTOUFS(mp);
+
+	MNT_ILOCK(mp);
+	if ((mp->mnt_flag & MNT_RDONLY) == 0 && force == 0) {
+		MNT_IUNLOCK(mp);
+		return (EINVAL);
+	}
+	MNT_IUNLOCK(mp);
+
 	/*
 	 * Step 1: invalidate all cached meta-data.
 	 */
@@ -655,8 +661,7 @@ ffs_reload(struct mount *mp, struct thread *td)
 	newfs->fs_maxcluster = fs->fs_maxcluster;
 	newfs->fs_contigdirs = fs->fs_contigdirs;
 	newfs->fs_active = fs->fs_active;
-	/* The file system is still read-only. */
-	newfs->fs_ronly = 1;
+	newfs->fs_ronly = fs->fs_ronly;
 	sblockloc = fs->fs_sblockloc;
 	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
 	brelse(bp);
@@ -711,6 +716,13 @@ ffs_reload(struct mount *mp, struct thread *td)
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
+		 * Skip syncer vnode.
+		 */
+		if (vp->v_type == VNON) {
+			VI_UNLOCK(vp);
+			continue;
+		}
+		/*
 		 * Step 4: invalidate all cached file data.
 		 */
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
@@ -1834,6 +1846,7 @@ ffs_init(vfsp)
 	struct vfsconf *vfsp;
 {
 
+	ffs_susp_initialize();
 	softdep_initialize();
 	return (ufs_init(vfsp));
 }
@@ -1849,6 +1862,7 @@ ffs_uninit(vfsp)
 
 	ret = ufs_uninit(vfsp);
 	softdep_uninitialize();
+	ffs_susp_uninitialize();
 
 	return (ret);
 }
@@ -2198,6 +2212,15 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
 	g_vfs_strategy(bo, bp);
 }
 
+int
+ffs_own_mount(const struct mount *mp)
+{
+
+	if (mp->mnt_op == &ufs_vfsops)
+		return (1);
+	return (0);
+}
+
 #ifdef DDB
 
 static void
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index c992b1d..dbefad3 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -33,6 +33,8 @@
 #ifndef _UFS_FFS_FS_H_
 #define _UFS_FFS_FS_H_
 
+#include
+
 /*
  * Each disk drive contains some number of filesystems.
  * A filesystem consists of a number of cylinder groups.
@@ -763,4 +765,10 @@ CTASSERT(sizeof(union jrec) == JREC_SIZE);
 extern int inside[], around[];
 extern u_char *fragtbl[];
 
+/*
+ * IOCTLs used for filesystem write suspension.
+ */
+#define	UFSSUSPEND	_IOW('U', 1, fsid_t)
+#define	UFSRESUME	_IO('U', 2)
+
 #endif
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 86a6d14..ac0603e 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -98,6 +98,7 @@ struct ufsmount {
 	char	um_qflags[MAXQUOTAS];		/* quota specific flags */
 	int64_t	um_savedmaxfilesize;		/* XXX - limit maxfilesize */
 	int	um_candelete;			/* devvp supports TRIM */
+	int	um_writesuspended;		/* suspension in progress */
 	int	(*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **);
 	int	(*um_blkatoff)(struct vnode *, off_t, char **, struct buf **);
 	int	(*um_truncate)(struct vnode *, off_t, int, struct ucred *);
-- 
cgit v1.1