Diffstat (limited to 'sys/ufs')
-rw-r--r--  sys/ufs/ffs/ffs_alloc.c    | 193
-rw-r--r--  sys/ufs/ffs/ffs_balloc.c   |  28
-rw-r--r--  sys/ufs/ffs/ffs_extern.h   |   9
-rw-r--r--  sys/ufs/ffs/ffs_inode.c    |   4
-rw-r--r--  sys/ufs/ffs/ffs_snapshot.c |   3
-rw-r--r--  sys/ufs/ffs/ffs_softdep.c  | 311
-rw-r--r--  sys/ufs/ffs/ffs_vfsops.c   | 160
-rw-r--r--  sys/ufs/ffs/fs.h           |   4
-rw-r--r--  sys/ufs/ffs/softdep.h      |   3
-rw-r--r--  sys/ufs/ufs/ufs_inode.c    |   9
-rw-r--r--  sys/ufs/ufs/ufsmount.h     |   1
11 files changed, 571 insertions, 154 deletions
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 7d7866c..82506fb 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -1829,7 +1829,7 @@ gotit:
 	}
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
+		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
 	bdwrite(bp);
 	if (ibp != NULL)
 		bawrite(ibp);
@@ -2038,7 +2038,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
 	    ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
 		return;
 	}
-	if (!ump->um_candelete) {
+	/*
+	 * Nothing to delay if TRIM is disabled, or the operation is
+	 * performed on the snapshot.
+	 */
+	if (!ump->um_candelete || devvp->v_type == VREG) {
 		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
 		return;
 	}
@@ -2377,6 +2381,18 @@
  * in the current directory is oldvalue then change it to newvalue.
  * unlink(nameptr, oldvalue) - Verify that the inode number associated
  * with nameptr in the current directory is oldvalue then unlink it.
+ *
+ * The following functions may only be used on a quiescent filesystem
+ * by the soft updates journal. They are not safe to be run on an active
+ * filesystem.
+ *
+ * setinode(inode, dip) - the specified disk inode is replaced with the
+ *	contents pointed to by dip.
+ * setbufoutput(fd, flags) - output associated with the specified file
+ *	descriptor (which must reference the character device supporting
+ *	the filesystem) switches from using physio to running through the
+ *	buffer cache when flags is set to 1. The descriptor reverts to
+ *	physio for output when flags is set to zero.
  */

static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
@@ -2423,11 +2439,21 @@ static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR,
 static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Unlink a Duplicate Name");

+static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR,
+	sysctl_ffs_fsck, "Update an On-Disk Inode");
+
+static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR,
+	sysctl_ffs_fsck, "Set Buffered Writing for Descriptor");
+
+#define	DEBUG	1
 #ifdef DEBUG
-static int fsckcmds = 0;
+static int fsckcmds = 1;
 SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "");
 #endif /* DEBUG */

+static int buffered_write(struct file *, struct uio *, struct ucred *,
+	    int, struct thread *);
+
 static int
 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 {
@@ -2441,8 +2467,10 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 	ufs2_daddr_t blkno;
 	long blkcnt, blksize;
 	struct filedesc *fdp;
-	struct file *fp;
+	struct file *fp, *vfp;
 	int vfslocked, filetype, error;
+	static struct fileops *origops, bufferedops;
+	static int outcnt = 0;

 	if (req->newlen > sizeof cmd)
 		return (EBADRPC);
@@ -2450,7 +2478,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 		return (error);
 	if (cmd.version != FFS_CMD_VERSION)
 		return (ERPCMISMATCH);
-	if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 0)
+	if ((error = getvnode(td->td_proc->p_fd, cmd.handle, &fp)) != 0)
 		return (error);
 	vp = fp->f_data;
 	if (vp->v_type != VREG && vp->v_type != VDIR) {
@@ -2463,12 +2491,13 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 		fdrop(fp, td);
 		return (EINVAL);
 	}
-	if (mp->mnt_flag & MNT_RDONLY) {
+	ump = VFSTOUFS(mp);
+	if ((mp->mnt_flag & MNT_RDONLY) &&
+	    ump->um_fsckpid != td->td_proc->p_pid) {
 		vn_finished_write(mp);
 		fdrop(fp, td);
 		return (EROFS);
 	}
-	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;

 	filetype = IFREG;
@@ -2489,7 +2518,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 	case FFS_ADJ_REFCNT:
 #ifdef DEBUG
 		if (fsckcmds) {
-			printf("%s: adjust inode %jd count by %jd\n",
+			printf("%s: adjust inode %jd link count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
@@ -2500,7 +2529,8 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 		ip->i_nlink += cmd.size;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_effnlink += cmd.size;
-		ip->i_flag |= IN_CHANGE;
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+		error = ffs_update(vp, 1);
 		if (DOINGSOFTDEP(vp))
 			softdep_change_linkcnt(ip);
 		vput(vp);
@@ -2518,7 +2548,8 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 			break;
 		ip = VTOI(vp);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
-		ip->i_flag |= IN_CHANGE;
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+		error = ffs_update(vp, 1);
 		vput(vp);
 		break;

@@ -2718,6 +2749,78 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 		    UIO_USERSPACE, (ino_t)cmd.size);
 		break;

+	case FFS_SET_INODE:
+		if (ump->um_fsckpid != td->td_proc->p_pid) {
+			error = EPERM;
+			break;
+		}
+#ifdef DEBUG
+		if (fsckcmds && outcnt++ < 100) {
+			printf("%s: update inode %jd\n",
+			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
+		}
+#endif /* DEBUG */
+		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
+			break;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		AUDIT_ARG_VNODE1(vp);
+		ip = VTOI(vp);
+		if (ip->i_ump->um_fstype == UFS1)
+			error = copyin((void *)(intptr_t)cmd.size, ip->i_din1,
+			    sizeof(struct ufs1_dinode));
+		else
+			error = copyin((void *)(intptr_t)cmd.size, ip->i_din2,
+			    sizeof(struct ufs2_dinode));
+		if (error) {
+			vput(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
+			break;
+		}
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+		error = ffs_update(vp, 1);
+		vput(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		break;
+
+	case FFS_SET_BUFOUTPUT:
+		if (ump->um_fsckpid != td->td_proc->p_pid) {
+			error = EPERM;
+			break;
+		}
+		if (VTOI(vp)->i_ump != ump) {
+			error = EINVAL;
+			break;
+		}
+#ifdef DEBUG
+		if (fsckcmds) {
+			printf("%s: %s buffered output for descriptor %jd\n",
+			    mp->mnt_stat.f_mntonname,
+			    cmd.size == 1 ? "enable" : "disable",
+			    (intmax_t)cmd.value);
+		}
+#endif /* DEBUG */
+		if ((error = getvnode(td->td_proc->p_fd, cmd.value, &vfp)) != 0)
+			break;
+		if (vfp->f_vnode->v_type != VCHR) {
+			fdrop(vfp, td);
+			error = EINVAL;
+			break;
+		}
+		if (origops == NULL) {
+			origops = vfp->f_ops;
+			bcopy((void *)origops, (void *)&bufferedops,
+			    sizeof(bufferedops));
+			bufferedops.fo_write = buffered_write;
+		}
+		if (cmd.size == 1)
+			atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
+			    (uintptr_t)&bufferedops);
+		else
+			atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
+			    (uintptr_t)origops);
+		fdrop(vfp, td);
+		break;
+
 	default:
 #ifdef DEBUG
 		if (fsckcmds) {
@@ -2733,3 +2836,73 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 	vn_finished_write(mp);
 	return (error);
 }
+
+/*
+ * Function to switch a descriptor to use the buffer cache to stage
+ * its I/O. This is needed so that writes to the filesystem device
+ * will give snapshots a chance to copy modified blocks for which it
+ * needs to retain copies.
+ */
+static int
+buffered_write(fp, uio, active_cred, flags, td)
+	struct file *fp;
+	struct uio *uio;
+	struct ucred *active_cred;
+	int flags;
+	struct thread *td;
+{
+	struct vnode *devvp;
+	struct inode *ip;
+	struct buf *bp;
+	struct fs *fs;
+	int error, vfslocked;
+	daddr_t lbn;
+	static int outcnt = 0;
+
+	/*
+	 * The devvp is associated with the /dev filesystem. To discover
+	 * the filesystem with which the device is associated, we depend
+	 * on the application setting the current directory to a location
+	 * within the filesystem being written. Yes, this is an ugly hack.
+	 */
+	devvp = fp->f_vnode;
+	ip = VTOI(td->td_proc->p_fd->fd_cdir);
+	if (ip->i_devvp != devvp)
+		return (EINVAL);
+	fs = ip->i_fs;
+	vfslocked = VFS_LOCK_GIANT(ip->i_vnode->v_mount);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	if ((flags & FOF_OFFSET) == 0)
+		uio->uio_offset = fp->f_offset;
+#ifdef DEBUG
+	if (fsckcmds && outcnt++ < 100) {
+		printf("%s: buffered write for block %jd\n",
+		    fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset));
+	}
+#endif /* DEBUG */
+	/*
+	 * All I/O must be contained within a filesystem block, start on
+	 * a fragment boundary, and be a multiple of fragments in length.
+	 */
+	if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) ||
+	    fragoff(fs, uio->uio_offset) != 0 ||
+	    fragoff(fs, uio->uio_resid) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	lbn = numfrags(fs, uio->uio_offset);
+	bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0);
+	bp->b_flags |= B_RELBUF;
+	if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) {
+		brelse(bp);
+		goto out;
+	}
+	error = bwrite(bp);
+	if ((flags & FOF_OFFSET) == 0)
+		fp->f_offset = uio->uio_offset;
+	fp->f_nextoff = uio->uio_offset;
+out:
+	VOP_UNLOCK(devvp, 0);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
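Usage note: the new setinode and setbufoutput commands follow the same userland handshake as the existing vfs.ffs fsck sysctls: fill in a struct fsck_cmd and push it through sysctlbyname() as the new value. A minimal sketch under those assumptions follows; the helper names are hypothetical, error handling is elided, and, per the kernel checks above, both commands fail with EPERM unless the calling process registered itself via the "fsckpid" mount option.

/*
 * Hypothetical userland helpers driving the new vfs.ffs sysctls.
 * struct fsck_cmd and FFS_CMD_VERSION come from <ufs/ffs/fs.h>.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <ufs/ffs/fs.h>
#include <stdint.h>

static int
fsck_setinode(int fsfd, ino_t inum, const void *dip)
{
	struct fsck_cmd cmd = { 0 };

	cmd.version = FFS_CMD_VERSION;
	cmd.handle = fsfd;		/* open descriptor in the filesystem */
	cmd.value = inum;		/* inode number to replace */
	cmd.size = (intptr_t)dip;	/* user address of new dinode contents */
	return (sysctlbyname("vfs.ffs.setinode", NULL, NULL, &cmd,
	    sizeof(cmd)));
}

static int
fsck_setbufoutput(int fsfd, int devfd, int enable)
{
	struct fsck_cmd cmd = { 0 };

	cmd.version = FFS_CMD_VERSION;
	cmd.handle = fsfd;
	cmd.value = devfd;		/* descriptor on the character device */
	cmd.size = enable;		/* 1: buffer cache, 0: physio */
	return (sysctlbyname("vfs.ffs.setbufoutput", NULL, NULL, &cmd,
	    sizeof(cmd)));
}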
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index 63a4eba..f6b078f 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -234,9 +234,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
 	if (num < 1)
 		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
 #endif
-	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
-	    TDP_INBDFLUSH);
-	curthread->td_pflags |= TDP_INBDFLUSH;
+	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
 	/*
 	 * Fetch the first indirect block allocating if necessary.
 	 */
@@ -250,7 +248,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
 		    flags, cred, &newb)) != 0) {
-			curthread->td_pflags &= saved_inbdflush;
+			curthread_pflags_restore(saved_inbdflush);
 			return (error);
 		}
 		nb = newb;
@@ -356,7 +354,7 @@ retry:
 	 * If asked only for the indirect block, then return it.
 	 */
 	if (flags & BA_METAONLY) {
-		curthread->td_pflags &= saved_inbdflush;
+		curthread_pflags_restore(saved_inbdflush);
 		*bpp = bp;
 		return (0);
 	}
@@ -406,7 +404,7 @@ retry:
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
-		curthread->td_pflags &= saved_inbdflush;
+		curthread_pflags_restore(saved_inbdflush);
 		*bpp = nbp;
 		return (0);
 	}
@@ -428,11 +426,11 @@ retry:
 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 	}
-	curthread->td_pflags &= saved_inbdflush;
+	curthread_pflags_restore(saved_inbdflush);
 	*bpp = nbp;
 	return (0);
 fail:
-	curthread->td_pflags &= saved_inbdflush;
+	curthread_pflags_restore(saved_inbdflush);
 	/*
 	 * If we have failed to allocate any blocks, simply return the error.
 	 * This is the usual case and avoids the need to fsync the file.
@@ -774,9 +772,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
 	if (num < 1)
 		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
 #endif
-	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
-	    TDP_INBDFLUSH);
-	curthread->td_pflags |= TDP_INBDFLUSH;
+	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
 	/*
 	 * Fetch the first indirect block allocating if necessary.
 	 */
@@ -790,7 +786,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
 		    flags, cred, &newb)) != 0) {
-			curthread->td_pflags &= saved_inbdflush;
+			curthread_pflags_restore(saved_inbdflush);
 			return (error);
 		}
 		nb = newb;
@@ -896,7 +892,7 @@ retry:
 	 * If asked only for the indirect block, then return it.
 	 */
 	if (flags & BA_METAONLY) {
-		curthread->td_pflags &= saved_inbdflush;
+		curthread_pflags_restore(saved_inbdflush);
 		*bpp = bp;
 		return (0);
 	}
@@ -946,7 +942,7 @@ retry:
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
-		curthread->td_pflags &= saved_inbdflush;
+		curthread_pflags_restore(saved_inbdflush);
 		*bpp = nbp;
 		return (0);
 	}
@@ -974,11 +970,11 @@ retry:
 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 	}
-	curthread->td_pflags &= saved_inbdflush;
+	curthread_pflags_restore(saved_inbdflush);
 	*bpp = nbp;
 	return (0);
fail:
-	curthread->td_pflags &= saved_inbdflush;
+	curthread_pflags_restore(saved_inbdflush);
 	/*
 	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
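Usage note: the open-coded save/set/restore of TDP_INBDFLUSH is replaced throughout by the curthread_pflags_set()/curthread_pflags_restore() pair. As a minimal sketch of what those helpers do, mirroring the sequence they replace (FreeBSD defines them as inlines in <sys/proc.h>):

static __inline int
curthread_pflags_set(int flags)
{
	struct thread *td;
	int save;

	td = curthread;
	/* Remember which of the requested flags were already set. */
	save = ~flags | (td->td_pflags & flags);
	td->td_pflags |= flags;
	return (save);
}

static __inline void
curthread_pflags_restore(int save)
{

	/* Clear only the flags that were not set on entry. */
	curthread->td_pflags &= save;
}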
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index fb1b1fb..70bcf1d 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -33,7 +33,10 @@
 #ifndef _UFS_FFS_EXTERN_H
 #define _UFS_FFS_EXTERN_H

-enum vtype;
+#ifndef _KERNEL
+#error "No user-serving parts inside"
+#else
+
 struct buf;
 struct cg;
 struct fid;
@@ -127,7 +130,7 @@ void	softdep_freefile(struct vnode *, ino_t, int);
 int	softdep_request_cleanup(struct fs *, struct vnode *,
 	    struct ucred *, int);
 void	softdep_setup_freeblocks(struct inode *, off_t, int);
-void	softdep_setup_inomapdep(struct buf *, struct inode *, ino_t);
+void	softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int);
 void	softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,
 	    int, int);
 void	softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t,
@@ -168,7 +171,6 @@ void	softdep_freework(struct workhead *);

 int	ffs_rdonly(struct inode *);

-#ifdef _KERNEL
 TAILQ_HEAD(snaphead, inode);

 struct snapdata {
@@ -178,6 +180,7 @@ struct snapdata {
 	daddr_t *sn_blklist;
 	struct lock sn_lock;
 };
+
 #endif /* _KERNEL */

 #endif /* !_UFS_FFS_EXTERN_H */
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index a7b43e2..0034029 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -90,7 +90,7 @@ ffs_update(vp, waitfor)
 		return (0);
 	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
 	fs = ip->i_fs;
-	if (fs->fs_ronly)
+	if (fs->fs_ronly && ip->i_ump->um_fsckpid == 0)
 		return (0);
 	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, NOCRED, &bp);
@@ -128,7 +128,7 @@ ffs_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 	if ((object = vp->v_object) == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
-	vm_object_page_remove(object, start, end, FALSE);
+	vm_object_page_remove(object, start, end, 0);
 	VM_OBJECT_UNLOCK(object);
 }

diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index 8d236bd..c8dd4c6 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -1918,7 +1918,8 @@ retry:
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
-		if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0)
+		if ((vtype == VDIR || dopersistence) &&
+		    VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT);
 	}
 	/*
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 3734a5d..fccb296 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -142,10 +142,11 @@ softdep_setup_sbupdate(ump, fs, bp)
 }

 void
-softdep_setup_inomapdep(bp, ip, newinum)
+softdep_setup_inomapdep(bp, ip, newinum, mode)
 	struct buf *bp;
 	struct inode *ip;
 	ino_t newinum;
+	int mode;
 {

 	panic("softdep_setup_inomapdep called");
@@ -789,6 +790,8 @@ static	void diradd_inode_written(struct diradd *, struct inodedep *);
 static	int handle_written_indirdep(struct indirdep *, struct buf *,
 	    struct buf**);
 static	int handle_written_inodeblock(struct inodedep *, struct buf *);
+static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
+	    uint8_t *);
 static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
 static	void handle_written_jaddref(struct jaddref *);
 static	void handle_written_jremref(struct jremref *);
@@ -820,6 +823,8 @@ static	void handle_allocindir_partdone(struct allocindir *);
 static	void initiate_write_filepage(struct pagedep *, struct buf *);
 static	void initiate_write_indirdep(struct indirdep*, struct buf *);
 static	void handle_written_mkdir(struct mkdir *, int);
+static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
+	    uint8_t *);
 static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
 static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
@@ -875,6 +880,7 @@ static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
 static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
 	    ufs_lbn_t, int);
 static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
+static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
 static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
 ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
 static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
@@ -935,6 +941,7 @@ static	void wake_worklist(struct worklist *);
 static	void wait_worklist(struct worklist *, char *);
 static	void remove_from_worklist(struct worklist *);
 static	void softdep_flush(void);
+static	void softdep_flushjournal(struct mount *);
 static	int softdep_speedup(void);
 static	void worklist_speedup(void);
 static	int journal_mount(struct mount *, struct fs *, struct ucred *);
@@ -3046,6 +3053,25 @@ jfsync_write(jfsync, jseg, data)
 	rec->jt_extsize = jfsync->jfs_extsize;
 }

+static void
+softdep_flushjournal(mp)
+	struct mount *mp;
+{
+	struct jblocks *jblocks;
+	struct ufsmount *ump;
+
+	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+		return;
+	ump = VFSTOUFS(mp);
+	jblocks = ump->softdep_jblocks;
+	ACQUIRE_LOCK(&lk);
+	while (ump->softdep_on_journal) {
+		jblocks->jb_needseg = 1;
+		softdep_process_journal(mp, NULL, MNT_WAIT);
+	}
+	FREE_LOCK(&lk);
+}
+
 /*
  * Flush some journal records to disk.
  */
@@ -4310,7 +4336,6 @@ softdep_setup_create(dp, ip)
 		    inoreflst);
 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 		    ("softdep_setup_create: No addref structure present."));
-		jaddref->ja_mode = ip->i_mode;
 	}
 	softdep_prelink(dvp, NULL);
 	FREE_LOCK(&lk);
@@ -4417,7 +4442,6 @@ softdep_setup_mkdir(dp, ip)
 		KASSERT(jaddref->ja_parent == dp->i_number,
 		    ("softdep_setup_mkdir: bad parent %d",
 		    jaddref->ja_parent));
-		jaddref->ja_mode = ip->i_mode;
 		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
 		    if_deps);
 	}
@@ -4637,10 +4661,11 @@ softdep_revert_rmdir(dp, ip)
  * Called just after updating the cylinder group block to allocate an inode.
  */
 void
-softdep_setup_inomapdep(bp, ip, newinum)
+softdep_setup_inomapdep(bp, ip, newinum, mode)
 	struct buf *bp;		/* buffer for cylgroup block with inode map */
 	struct inode *ip;	/* inode related to allocation */
 	ino_t newinum;		/* new inode number being allocated */
+	int mode;
 {
 	struct inodedep *inodedep;
 	struct bmsafemap *bmsafemap;
@@ -4657,7 +4682,7 @@ softdep_setup_inomapdep(bp, ip, newinum)
 	 * can be dependent on it.
 	 */
 	if (mp->mnt_kern_flag & MNTK_SUJ) {
-		jaddref = newjaddref(ip, newinum, 0, 0, 0);
+		jaddref = newjaddref(ip, newinum, 0, 0, mode);
 		jaddref->ja_state |= NEWBLOCK;
 	}

@@ -5014,14 +5039,12 @@ jnewblk_merge(new, old, wkhd)
 	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
 		panic("jnewblk_merge: Merging disparate blocks.");
 	/*
-	 * The record may be rolled back in the cg update bits
-	 * appropriately. NEWBLOCK here alerts the cg rollback code
-	 * that the frag bits have changed.
+	 * The record may be rolled back in the cg.
 	 */
 	if (jnewblk->jn_state & UNDONE) {
-		njnewblk->jn_state |= UNDONE | NEWBLOCK;
-		njnewblk->jn_state &= ~ATTACHED;
 		jnewblk->jn_state &= ~UNDONE;
+		njnewblk->jn_state |= UNDONE;
+		njnewblk->jn_state &= ~ATTACHED;
 	}
 	/*
 	 * We modify the newer addref and free the older so that if neither
@@ -5176,7 +5199,7 @@ newfreefrag(ip, blkno, size, lbn)
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;

-	if (fs->fs_flags & FS_SUJ) {
+	if ((ip->i_ump->um_mountp->mnt_kern_flag & MNTK_SUJ) != 0) {
 		freefrag->ff_jdep = (struct worklist *)
 		    newjfreefrag(freefrag, ip, blkno, size, lbn);
 	} else {
@@ -5729,7 +5752,6 @@ newfreeblks(mp, ip)
 	freeblks->fb_modrev = DIP(ip, i_modrev);
 	freeblks->fb_devvp = ip->i_devvp;
 	freeblks->fb_chkcnt = 0;
-	freeblks->fb_freecnt = 0;
 	freeblks->fb_len = 0;

 	return (freeblks);
@@ -6177,7 +6199,7 @@ softdep_journal_freeblocks(ip, cred, length, flags)
 	quotaref(vp, freeblks->fb_quota);
 	(void) chkdq(ip, -datablocks, NOCRED, 0);
 #endif
-	freeblks->fb_chkcnt = datablocks;
+	freeblks->fb_chkcnt = -datablocks;
 	UFS_LOCK(ip->i_ump);
 	fs->fs_pendingblocks += datablocks;
 	UFS_UNLOCK(ip->i_ump);
@@ -6407,7 +6429,7 @@ softdep_setup_freeblocks(ip, length, flags)
 	quotaref(vp, freeblks->fb_quota);
 	(void) chkdq(ip, -datablocks, NOCRED, 0);
 #endif
-	freeblks->fb_chkcnt = datablocks;
+	freeblks->fb_chkcnt = -datablocks;
 	UFS_LOCK(ip->i_ump);
 	fs->fs_pendingblocks += datablocks;
 	UFS_UNLOCK(ip->i_ump);
@@ -7262,8 +7284,8 @@ freework_freeblock(freework)
 		freeblks->fb_cgwait++;
 		WORKLIST_INSERT(&wkhd, &freework->fw_list);
 	}
-	freeblks->fb_freecnt += btodb(bsize);
 	FREE_LOCK(&lk);
+	freeblks_free(ump, freeblks, btodb(bsize));
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
 	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
 	ACQUIRE_LOCK(&lk);
@@ -7437,6 +7459,33 @@ handle_workitem_freeblocks(freeblks, flags)
 }

 /*
+ * Handle completion of block free via truncate. This allows fs_pending
+ * to track the actual free block count more closely than if we only updated
+ * it at the end. We must be careful to handle cases where the block count
+ * on free was incorrect.
+ */
+static void
+freeblks_free(ump, freeblks, blocks)
+	struct ufsmount *ump;
+	struct freeblks *freeblks;
+	int blocks;
+{
+	struct fs *fs;
+	ufs2_daddr_t remain;
+
+	UFS_LOCK(ump);
+	remain = -freeblks->fb_chkcnt;
+	freeblks->fb_chkcnt += blocks;
+	if (remain > 0) {
+		if (remain < blocks)
+			blocks = remain;
+		fs = ump->um_fs;
+		fs->fs_pendingblocks -= blocks;
+	}
+	UFS_UNLOCK(ump);
+}
+
+/*
  * Once all of the freework workitems are complete we can retire the
 * freeblocks dependency and any journal work awaiting completion. This
 * can not be called until all other dependencies are stable on disk.
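Usage note: with fb_freecnt gone, fb_chkcnt does double duty. It starts at -datablocks and counts up toward zero as freeblks_free() releases fs_pendingblocks credit, clamped so an unexpected extra free cannot drive the pending count negative. A standalone toy (not kernel code) illustrating the sign convention under assumed example values:

#include <stdio.h>

static long fb_chkcnt = -8;		/* truncate expects 8 blocks */
static long fs_pendingblocks = 8;

static void
freeblks_free_demo(long blocks)
{
	long remain = -fb_chkcnt;	/* credit still outstanding */

	fb_chkcnt += blocks;
	if (remain > 0)
		fs_pendingblocks -= (remain < blocks) ? remain : blocks;
}

int
main(void)
{
	freeblks_free_demo(6);	/* chkcnt -2, pending 2 */
	freeblks_free_demo(4);	/* 2 blocks more than expected */
	/* Prints "chkcnt 2 pending 0"; the positive remainder is the
	 * "spare" that handle_complete_freeblocks() reconciles below. */
	printf("chkcnt %ld pending %ld\n", fb_chkcnt, fs_pendingblocks);
	return (0);
}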
@@ -7456,7 +7505,7 @@ handle_complete_freeblocks(freeblks, flags)
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	flags = LK_EXCLUSIVE | flags;
-	spare = freeblks->fb_freecnt - freeblks->fb_chkcnt;
+	spare = freeblks->fb_chkcnt;

 	/*
 	 * If we did not release the expected number of blocks we may have
@@ -7479,9 +7528,9 @@ handle_complete_freeblocks(freeblks, flags)
 		}
 		vput(vp);
 	}
-	if (freeblks->fb_chkcnt) {
+	if (spare < 0) {
 		UFS_LOCK(ump);
-		fs->fs_pendingblocks -= freeblks->fb_chkcnt;
+		fs->fs_pendingblocks += spare;
 		UFS_UNLOCK(ump);
 	}
 #ifdef QUOTA
@@ -7537,7 +7586,7 @@ indir_trunc(freework, dbn, lbn)
 	ufs2_daddr_t nb, nnb, *bap2 = 0;
 	ufs_lbn_t lbnadd, nlbn;
 	int i, nblocks, ufs1fmt;
-	int fs_pendingblocks;
+	int freedblocks;
 	int goingaway;
 	int freedeps;
 	int needj;
@@ -7679,16 +7728,18 @@ indir_trunc(freework, dbn, lbn)
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
-	fs_pendingblocks = 0;
+	freedblocks = 0;
 	if (level == 0)
-		fs_pendingblocks = (nblocks * cnt);
+		freedblocks = (nblocks * cnt);
+	if (needj == 0)
+		freedblocks += nblocks;
+	freeblks_free(ump, freeblks, freedblocks);
 	/*
 	 * If we are journaling set up the ref counts and offset so this
 	 * indirect can be completed when its children are free.
 	 */
 	if (needj) {
 		ACQUIRE_LOCK(&lk);
-		freeblks->fb_freecnt += fs_pendingblocks;
 		freework->fw_off = i;
 		freework->fw_ref += freedeps;
 		freework->fw_ref -= NINDIR(fs) + 1;
@@ -7702,12 +7753,10 @@ indir_trunc(freework, dbn, lbn)
 	/*
 	 * If we're not journaling we can free the indirect now.
 	 */
-	fs_pendingblocks += nblocks;
 	dbn = dbtofsb(fs, dbn);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
 	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
 	/* Non SUJ softdep does single-threaded truncations. */
-	freeblks->fb_freecnt += fs_pendingblocks;
 	if (freework->fw_blkno == dbn) {
 		freework->fw_state |= ALLCOMPLETE;
 		ACQUIRE_LOCK(&lk);
@@ -8879,7 +8928,7 @@ softdep_setup_sbupdate(ump, fs, bp)
 	struct sbdep *sbdep;
 	struct worklist *wk;

-	if ((fs->fs_flags & FS_SUJ) == 0)
+	if ((ump->um_mountp->mnt_kern_flag & MNTK_SUJ) == 0)
 		return;
 	LIST_FOREACH(wk, &bp->b_dep, wk_list)
 		if (wk->wk_type == D_SBDEP)
@@ -10233,6 +10282,70 @@ softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
 	FREE_LOCK(&lk);
 }

+/*
+ * Revert a block allocation when the journal record that describes it
+ * is not yet written.
+ */
+int
+jnewblk_rollback(jnewblk, fs, cgp, blksfree)
+	struct jnewblk *jnewblk;
+	struct fs *fs;
+	struct cg *cgp;
+	uint8_t *blksfree;
+{
+	ufs1_daddr_t fragno;
+	long cgbno, bbase;
+	int frags, blk;
+	int i;
+
+	frags = 0;
+	cgbno = dtogd(fs, jnewblk->jn_blkno);
+	/*
+	 * We have to test which frags need to be rolled back. We may
+	 * be operating on a stale copy when doing background writes.
+	 */
+	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
+		if (isclr(blksfree, cgbno + i))
+			frags++;
+	if (frags == 0)
+		return (0);
+	/*
+	 * This is mostly ffs_blkfree() sans some validation and
+	 * superblock updates.
+	 */
+	if (frags == fs->fs_frag) {
+		fragno = fragstoblks(fs, cgbno);
+		ffs_setblock(fs, blksfree, fragno);
+		ffs_clusteracct(fs, cgp, fragno, 1);
+		cgp->cg_cs.cs_nbfree++;
+	} else {
+		cgbno += jnewblk->jn_oldfrags;
+		bbase = cgbno - fragnum(fs, cgbno);
+		/* Decrement the old frags. */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
+		/* Deallocate the fragment */
+		for (i = 0; i < frags; i++)
+			setbit(blksfree, cgbno + i);
+		cgp->cg_cs.cs_nffree += frags;
+		/* Add back in counts associated with the new frags */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+		/* If a complete block has been reassembled, account for it. */
+		fragno = fragstoblks(fs, bbase);
+		if (ffs_isblock(fs, blksfree, fragno)) {
+			cgp->cg_cs.cs_nffree -= fs->fs_frag;
+			ffs_clusteracct(fs, cgp, fragno, 1);
+			cgp->cg_cs.cs_nbfree++;
+		}
+	}
+	stat_jnewblk++;
+	jnewblk->jn_state &= ~ATTACHED;
+	jnewblk->jn_state |= UNDONE;
+
+	return (frags);
+}
+
 static void
 initiate_write_bmsafemap(bmsafemap, bp)
 	struct bmsafemap *bmsafemap;
@@ -10244,10 +10357,7 @@ initiate_write_bmsafemap(bmsafemap, bp)
 	uint8_t *blksfree;
 	struct cg *cgp;
 	struct fs *fs;
-	int cleared;
 	ino_t ino;
-	long bno;
-	int i;

 	if (bmsafemap->sm_state & IOSTARTED)
 		panic("initiate_write_bmsafemap: Already started\n");
@@ -10286,25 +10396,9 @@ initiate_write_bmsafemap(bmsafemap, bp)
 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
 		blksfree = cg_blksfree(cgp);
 		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
-			bno = dtogd(fs, jnewblk->jn_blkno);
-			cleared = 0;
-			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
-			    i++) {
-				if (isclr(blksfree, bno + i)) {
-					cleared = 1;
-					setbit(blksfree, bno + i);
-				}
-			}
-			/*
-			 * We may not clear the block if it's a background
-			 * copy. In that case there is no reason to detach
-			 * it.
-			 */
-			if (cleared) {
-				stat_jnewblk++;
-				jnewblk->jn_state &= ~ATTACHED;
-				jnewblk->jn_state |= UNDONE;
-			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
+				continue;
+			if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
 				panic("initiate_write_bmsafemap: block %jd "
 				    "marked free", jnewblk->jn_blkno);
 		}
@@ -10578,6 +10672,9 @@ handle_jwork(wkhd)
 		case D_FREEDEP:
 			free_freedep(WK_FREEDEP(wk));
 			continue;
+		case D_FREEFRAG:
+			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
+			WORKITEM_FREE(wk, D_FREEFRAG);
 		case D_FREEWORK:
 			handle_written_freework(WK_FREEWORK(wk));
 			continue;
@@ -11050,6 +11147,58 @@ bmsafemap_rollbacks(bmsafemap)
 }

 /*
+ * Re-apply an allocation when a cg write is complete.
+ */
+static int
+jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
+	struct jnewblk *jnewblk;
+	struct fs *fs;
+	struct cg *cgp;
+	uint8_t *blksfree;
+{
+	ufs1_daddr_t fragno;
+	ufs2_daddr_t blkno;
+	long cgbno, bbase;
+	int frags, blk;
+	int i;
+
+	frags = 0;
+	cgbno = dtogd(fs, jnewblk->jn_blkno);
+	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
+		if (isclr(blksfree, cgbno + i))
+			panic("jnewblk_rollforward: re-allocated fragment");
+		frags++;
+	}
+	if (frags == fs->fs_frag) {
+		blkno = fragstoblks(fs, cgbno);
+		ffs_clrblock(fs, blksfree, (long)blkno);
+		ffs_clusteracct(fs, cgp, blkno, -1);
+		cgp->cg_cs.cs_nbfree--;
+	} else {
+		bbase = cgbno - fragnum(fs, cgbno);
+		cgbno += jnewblk->jn_oldfrags;
+		/* If a complete block had been reassembled, account for it. */
+		fragno = fragstoblks(fs, bbase);
+		if (ffs_isblock(fs, blksfree, fragno)) {
+			cgp->cg_cs.cs_nffree += fs->fs_frag;
+			ffs_clusteracct(fs, cgp, fragno, -1);
+			cgp->cg_cs.cs_nbfree--;
+		}
+		/* Decrement the old frags. */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
+		/* Allocate the fragment */
+		for (i = 0; i < frags; i++)
+			clrbit(blksfree, cgbno + i);
+		cgp->cg_cs.cs_nffree -= frags;
+		/* Add back in counts associated with the new frags */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+	}
+	return (frags);
+}
+
+/*
  * Complete a write to a bmsafemap structure. Roll forward any bitmap
 * changes if it's not a background write. Set all written dependencies
 * to DEPCOMPLETE and free the structure if possible.
@@ -11069,9 +11218,7 @@ handle_written_bmsafemap(bmsafemap, bp)
 	struct cg *cgp;
 	struct fs *fs;
 	ino_t ino;
-	long bno;
 	int chgs;
-	int i;

 	if ((bmsafemap->sm_state & IOSTARTED) == 0)
 		panic("initiate_write_bmsafemap: Not started\n");
@@ -11121,18 +11268,9 @@ handle_written_bmsafemap(bmsafemap, bp)
 		    jntmp) {
 			if ((jnewblk->jn_state & UNDONE) == 0)
 				continue;
-			bno = dtogd(fs, jnewblk->jn_blkno);
-			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
-			    i++) {
-				if (bp->b_xflags & BX_BKGRDMARKER)
-					break;
-				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
-				    isclr(blksfree, bno + i))
-					panic("handle_written_bmsafemap: "
-					    "re-allocated fragment");
-				clrbit(blksfree, bno + i);
+			if ((bp->b_xflags & BX_BKGRDMARKER) == 0 &&
+			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
 				chgs = 1;
-			}
 			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
 			jnewblk->jn_state |= ATTACHED;
 			free_jnewblk(jnewblk);
@@ -11826,6 +11964,11 @@ softdep_sync_metadata(struct vnode *vp)
 	 * truncations are started, and inode references are journaled.
 	 */
 	ACQUIRE_LOCK(&lk);
+	/*
+	 * Write all journal records to prevent rollbacks on devvp.
+	 */
+	if (vp->v_type == VCHR)
+		softdep_flushjournal(vp->v_mount);
 	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
 	/*
 	 * Ensure that all truncates are written so we won't find deps on
@@ -11965,6 +12108,8 @@ top:
 			continue;

 		case D_FREEWORK:
+		case D_FREEDEP:
+		case D_JSEGDEP:
 			continue;

 		default:
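Usage note: jnewblk_rollback() and jnewblk_rollforward() above are exact inverses on the cylinder-group free-fragment bitmap over frags [jn_oldfrags, jn_frags). A userspace toy showing just the bitmap half, using the same setbit/clrbit/isclr macros the kernel uses (from <sys/param.h>); cluster and summary accounting are omitted:

#include <sys/param.h>
#include <assert.h>
#include <string.h>

int
main(void)
{
	u_char blksfree[8];
	int jn_oldfrags = 2, jn_frags = 6, i;

	memset(blksfree, 0xff, sizeof(blksfree));	/* all frags free */
	for (i = jn_oldfrags; i < jn_frags; i++)	/* new allocation */
		clrbit(blksfree, i);

	/* Rollback: journal record not yet written, re-free the frags. */
	for (i = jn_oldfrags; i < jn_frags; i++)
		if (isclr(blksfree, i))
			setbit(blksfree, i);

	/* Rollforward: cg write complete, re-apply the allocation. */
	for (i = jn_oldfrags; i < jn_frags; i++) {
		assert(!isclr(blksfree, i));	/* must still be free */
		clrbit(blksfree, i);
	}
	return (0);
}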
+ * copy-on-write, then it is not safe to process any + * worklist items as we will recurse into the copyonwrite + * routine. This will result in an incoherent snapshot. */ - if (!(curthread->td_pflags & TDP_COWINPROGRESS)) { - UFS_UNLOCK(ump); - error = ffs_update(vp, 1); + if (curthread->td_pflags & TDP_COWINPROGRESS) + return (0); + UFS_UNLOCK(ump); + error = ffs_update(vp, 1); + if (error != 0) { UFS_LOCK(ump); - if (error != 0) - return (0); + return (0); } /* * If we are in need of resources, consider pausing for * tickdelay to give ourselves some breathing room. */ - UFS_UNLOCK(ump); ACQUIRE_LOCK(&lk); + process_removes(vp); + process_truncates(vp); request_cleanup(UFSTOVFS(ump), resource); FREE_LOCK(&lk); - UFS_LOCK(ump); /* * Now clean up at least as many resources as we will need. * @@ -12451,29 +12599,23 @@ softdep_request_cleanup(fs, vp, cred, resource) roundup((fs->fs_dsize * fs->fs_minfree / 100) - fs->fs_cstotal.cs_nffree, fs->fs_frag)); } else { + UFS_LOCK(ump); printf("softdep_request_cleanup: Unknown resource type %d\n", resource); return (0); } starttime = time_second; retry: - while ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && - fs->fs_cstotal.cs_nbfree <= needed) || - (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && - fs->fs_cstotal.cs_nifree <= needed)) { - UFS_UNLOCK(ump); + if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && + fs->fs_cstotal.cs_nbfree <= needed) || + (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && + fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(&lk); - process_removes(vp); - process_truncates(vp); if (ump->softdep_on_worklist > 0 && - process_worklist_item(UFSTOVFS(ump), 1, LK_NOWAIT) != 0) { + process_worklist_item(UFSTOVFS(ump), + ump->softdep_on_worklist, LK_NOWAIT) != 0) stat_worklist_push += 1; - FREE_LOCK(&lk); - UFS_LOCK(ump); - continue; - } FREE_LOCK(&lk); - UFS_LOCK(ump); } /* * If we still need resources and there are no more worklist @@ -12487,7 +12629,6 @@ retry: fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { - UFS_UNLOCK(ump); MNT_ILOCK(mp); MNT_VNODE_FOREACH(lvp, mp, mvp) { VI_LOCK(lvp); @@ -12516,7 +12657,6 @@ retry: VOP_FSYNC(lvp, MNT_NOWAIT, curthread); VOP_UNLOCK(lvp, 0); } - UFS_LOCK(ump); if (ump->softdep_on_worklist > 0) { stat_cleanup_retries += 1; goto retry; @@ -12525,6 +12665,7 @@ retry: } if (time_second - starttime > stat_cleanup_high_delay) stat_cleanup_high_delay = time_second - starttime; + UFS_LOCK(ump); return (1); } diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index b0f2d7e..35852bf 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -132,8 +132,8 @@ static struct buf_ops ffs_ops = { */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", - "multilabel", "nfsv4acls", "snapshot", "nosuid", "suiddir", "nosymfollow", - "sync", "union", "userquota", NULL }; + "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", + "nosymfollow", "sync", "union", "userquota", NULL }; static int ffs_mount(struct mount *mp) @@ -142,6 +142,7 @@ ffs_mount(struct mount *mp) struct thread *td; struct ufsmount *ump = 0; struct fs *fs; + pid_t fsckpid = 0; int error, flags; u_int mntorflags; accmode_t accmode; @@ -184,6 +185,29 @@ ffs_mount(struct mount *mp) vfs_deleteopt(mp->mnt_opt, "snapshot"); } + if (vfs_getopt(mp->mnt_optnew, 
"fsckpid", NULL, NULL) == 0 && + vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { + /* + * Once we have set the restricted PID, do not + * persist "fsckpid" in the options list. + */ + vfs_deleteopt(mp->mnt_optnew, "fsckpid"); + vfs_deleteopt(mp->mnt_opt, "fsckpid"); + if (mp->mnt_flag & MNT_UPDATE) { + if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && + vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { + printf("Checker enable: Must be read-only\n"); + return (EINVAL); + } + } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { + printf("Checker enable: Must be read-only\n"); + return (EINVAL); + } + /* Set to -1 if we are done */ + if (fsckpid == 0) + fsckpid = -1; + } + if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { printf("WARNING: \"acls\" and \"nfsv4acls\" " @@ -204,6 +228,20 @@ ffs_mount(struct mount *mp) ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; + if (fsckpid == -1 && ump->um_fsckpid > 0) { + if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || + (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) + return (error); + DROP_GIANT(); + g_topology_lock(); + /* + * Return to normal read-only mode. + */ + error = g_access(ump->um_cp, 0, -1, 0); + g_topology_unlock(); + PICKUP_GIANT(); + ump->um_fsckpid = 0; + } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* @@ -273,7 +311,10 @@ ffs_mount(struct mount *mp) softdep_unmount(mp); DROP_GIANT(); g_topology_lock(); - g_access(ump->um_cp, 0, -1, 0); + /* + * Drop our write and exclusive access. + */ + g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; @@ -292,6 +333,13 @@ ffs_mount(struct mount *mp) if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* + * If we are running a checker, do not allow upgrade. + */ + if (ump->um_fsckpid > 0) { + printf("Active checker, cannot rw upgrade\n"); + return (EINVAL); + } + /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ @@ -327,13 +375,9 @@ ffs_mount(struct mount *mp) DROP_GIANT(); g_topology_lock(); /* - * If we're the root device, we may not have an E count - * yet, get it now. + * Request exclusive write access. */ - if (ump->um_cp->ace == 0) - error = g_access(ump->um_cp, 0, 1, 1); - else - error = g_access(ump->um_cp, 0, 1, 0); + error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); PICKUP_GIANT(); if (error) @@ -389,6 +433,39 @@ ffs_mount(struct mount *mp) mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } + /* + * If this is a request from fsck to clean up the filesystem, + * then allow the specified pid to proceed. + */ + if (fsckpid > 0) { + if (ump->um_fsckpid != 0) { + printf("Active checker already running on %s\n", + fs->fs_fsmnt); + return (EINVAL); + } + KASSERT((mp->mnt_flag & MNT_SOFTDEP) == 0, + ("soft updates enabled on read-only file system")); + DROP_GIANT(); + g_topology_lock(); + /* + * Request write access. + */ + error = g_access(ump->um_cp, 0, 1, 0); + g_topology_unlock(); + PICKUP_GIANT(); + if (error) { + printf("Checker activation failed on %s\n", + fs->fs_fsmnt); + return (error); + } + ump->um_fsckpid = fsckpid; + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + fs->fs_mtime = time_second; + fs->fs_fmod = 1; + fs->fs_clean = 0; + (void) ffs_sbupdate(ump, MNT_WAIT, 0); + } /* * If this is a snapshot request, take the snapshot. 
@@ -452,6 +529,31 @@ ffs_mount(struct mount *mp)
 			vrele(devvp);
 			return (error);
 		}
+		if (fsckpid > 0) {
+			KASSERT((mp->mnt_flag & MNT_SOFTDEP) == 0,
+			    ("soft updates enabled on read-only file system"));
+			ump = VFSTOUFS(mp);
+			fs = ump->um_fs;
+			DROP_GIANT();
+			g_topology_lock();
+			/*
+			 * Request write access.
+			 */
+			error = g_access(ump->um_cp, 0, 1, 0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			if (error) {
+				printf("Checker activation failed on %s\n",
+				    fs->fs_fsmnt);
+			} else {
+				ump->um_fsckpid = fsckpid;
+				if (fs->fs_snapinum[0] != 0)
+					ffs_snapshot_mount(mp);
+				fs->fs_mtime = time_second;
+				fs->fs_clean = 0;
+				(void) ffs_sbupdate(ump, MNT_WAIT, 0);
+			}
+		}
 	}
 	vfs_mountedfrom(mp, fspec);
 	return (0);
@@ -665,13 +767,6 @@ ffs_mountfs(devvp, mp, td)
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
-
-	/*
-	 * If we are a root mount, drop the E flag so fsck can do its magic.
-	 * We will pick it up again when we remount R/W.
-	 */
-	if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
-		error = g_access(cp, 0, 0, -1);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0);
@@ -932,7 +1027,7 @@ ffs_mountfs(devvp, mp, td)
 	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
 	mp->mnt_stat.f_iosize = fs->fs_bsize;

-	if( mp->mnt_flag & MNT_ROOTFS) {
+	if (mp->mnt_flag & MNT_ROOTFS) {
 		/*
 		 * Root mount; update timestamp in mount structure.
 		 * this will be used by the common root mount code
@@ -1169,7 +1264,7 @@ ffs_unmount(mp, mntflags)
 	}
 	UFS_UNLOCK(ump);
 	softdep_unmount(mp);
-	if (fs->fs_ronly == 0) {
+	if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) {
 		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
 		error = ffs_sbupdate(ump, MNT_WAIT, 0);
 		if (error && error != ENXIO) {
@@ -1183,6 +1278,13 @@ ffs_unmount(mp, mntflags)
 	}
 	DROP_GIANT();
 	g_topology_lock();
+	if (ump->um_fsckpid > 0) {
+		/*
+		 * Return to normal read-only mode.
+		 */
+		error = g_access(ump->um_cp, 0, -1, 0);
+		ump->um_fsckpid = 0;
+	}
 	g_vfs_close(ump->um_cp);
 	g_topology_unlock();
 	PICKUP_GIANT();
@@ -1331,7 +1433,7 @@ ffs_sync(mp, waitfor)
 	td = curthread;
 	fs = ump->um_fs;

-	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
+	if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) {
 		printf("fs = %s\n", fs->fs_fsmnt);
 		panic("ffs_sync: rofs mod");
 	}
@@ -1689,12 +1791,12 @@ ffs_uninit(vfsp)
  * Write a superblock and associated information back to disk.
  */
 int
-ffs_sbupdate(mp, waitfor, suspended)
-	struct ufsmount *mp;
+ffs_sbupdate(ump, waitfor, suspended)
+	struct ufsmount *ump;
 	int waitfor;
 	int suspended;
 {
-	struct fs *fs = mp->um_fs;
+	struct fs *fs = ump->um_fs;
 	struct buf *sbbp;
 	struct buf *bp;
 	int blks;
@@ -1702,14 +1804,14 @@ ffs_sbupdate(mp, waitfor, suspended)
 	int i, size, error, allerror = 0;

 	if (fs->fs_ronly == 1 &&
-	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
-	    (MNT_RDONLY | MNT_UPDATE))
+	    (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
+	    (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0)
 		panic("ffs_sbupdate: write read-only filesystem");
 	/*
 	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
 	 */
-	sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize,
-	    0, 0, 0);
+	sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+	    (int)fs->fs_sbsize, 0, 0, 0);
 	/*
 	 * First write back the summary information.
 	 */
@@ -1719,7 +1821,7 @@ ffs_sbupdate(mp, waitfor, suspended)
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
-		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
+		bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
 		    size, 0, 0, 0);
 		bcopy(space, bp->b_data, (u_int)size);
 		space = (char *)space + size;
@@ -1755,9 +1857,9 @@ ffs_sbupdate(mp, waitfor, suspended)
 	fs->fs_fmod = 0;
 	fs->fs_time = time_second;
 	if (fs->fs_flags & FS_DOSOFTDEP)
-		softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp);
+		softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp);
 	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
-	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
+	ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
 	if (suspended)
 		bp->b_flags |= B_VALIDSUSPWRT;
 	if (waitfor != MNT_WAIT)
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index a0b8e5b..b1e2174 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -214,7 +214,9 @@
 #define	FFS_SET_CWD		12	/* set current directory */
 #define	FFS_SET_DOTDOT		13	/* set inode number for ".." */
 #define	FFS_UNLINK		14	/* remove a name in the filesystem */
-#define	FFS_MAXID		15	/* number of valid ffs ids */
+#define	FFS_SET_INODE		15	/* update an on-disk inode */
+#define	FFS_SET_BUFOUTPUT	16	/* set buffered writing on descriptor */
+#define	FFS_MAXID		16	/* number of valid ffs ids */

 /*
  * Command structure passed in to the filesystem to adjust filesystem values.
diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h
index 80c7315..b251ba8 100644
--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@@ -536,8 +536,7 @@ struct freeblks {
 #endif
 	uint64_t fb_modrev;		/* Inode revision at start of trunc. */
 	off_t	fb_len;			/* Length we're truncating to. */
-	ufs2_daddr_t fb_chkcnt;		/* Expected blks released. */
-	ufs2_daddr_t fb_freecnt;	/* Actual blocks released. */
+	ufs2_daddr_t fb_chkcnt;		/* Blocks released. */
 	ino_t	fb_inum;		/* inode owner of blocks */
 	enum	vtype fb_vtype;		/* inode owner's file type */
 	uid_t	fb_uid;			/* uid of previous owner of blocks */
diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c
index 7ddbe97..129c26d 100644
--- a/sys/ufs/ufs/ufs_inode.c
+++ b/sys/ufs/ufs/ufs_inode.c
@@ -120,15 +120,14 @@ ufs_inactive(ap)
 	isize = ip->i_size;
 	if (ip->i_ump->um_fstype == UFS2)
 		isize += ip->i_din2->di_extsize;
-	if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip)) {
+	if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip))
+		error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL,
+		    NOCRED, td);
+	if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) {
 #ifdef QUOTA
 		if (!getinoquota(ip))
 			(void)chkiq(ip, -1, NOCRED, FORCE);
 #endif
-		error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL,
-		    NOCRED, td);
-	}
-	if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) {
 #ifdef UFS_EXTATTR
 		ufs_extattr_vnode_inactive(vp, td);
 #endif
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 7874105..6447dce 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -77,6 +77,7 @@ struct ufsmount {
 	u_long	um_bptrtodb;		/* indir ptr to disk block */
 	u_long	um_seqinc;		/* inc between seq blocks */
 	struct	mtx um_lock;		/* Protects ufsmount & fs */
+	pid_t	um_fsckpid;		/* PID permitted fsck sysctls */
 	long	um_numindirdeps;	/* outstanding indirdeps */
 	struct	workhead softdep_workitem_pending; /* softdep work queue */
 	struct	worklist *softdep_worklist_tail; /* Tail pointer for above */