Add snapshots to the fast filesystem. Most of the changes support

the gating of system calls that cause modifications to the underlying filesystem. The gating can be enabled by any filesystem that needs to consistently suspend operations by adding the vop_stdgetwritemount to their set of vnops. Once gating is enabled, the function vfs_write_suspend stops all new write operations to a filesystem, allows any filesystem modifying system calls already in progress to complete, then sync's the filesystem to disk and returns. The function vfs_write_resume allows the suspended write operations to begin again. Gating is not added by default for all filesystems as for SMP systems it adds two extra locks to such critical kernel paths as the write system call. Thus, gating should only be added as needed. Details on the use and current status of snapshots in FFS can be found in /sys/ufs/ffs/README.snapshot so for brevity and timelyness is not included here. Unless and until you create a snapshot file, these changes should have no effect on your system (famous last words).
author: mckusick <mckusick@FreeBSD.org> 2000-07-11 22:07:57 +0000
committer: mckusick <mckusick@FreeBSD.org> 2000-07-11 22:07:57 +0000
commit: a3d0c189ea25a7af3dfab30112f5d8d65e214e1c (patch)
tree: c84458dcf49aaf90ff010ebc108cb3b6ca3c2f4a /sys/ufs/ffs/ffs_snapshot.c
parent: c8c04452402a28eabd1ed8a1a06e0a14ac3d22c6 (diff)
download: FreeBSD-src-a3d0c189ea25a7af3dfab30112f5d8d65e214e1c.zip
FreeBSD-src-a3d0c189ea25a7af3dfab30112f5d8d65e214e1c.tar.gz
1 files changed, 1028 insertions, 0 deletions
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
new file mode 100644
index 0000000..73da537
--- /dev/null
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -0,0 +1,1028 @@
+/*
+ * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
+ *
+ * Further information about snapshots can be obtained from:
+ *
+ *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
+ *	1614 Oxford Street		mckusick@mckusick.com
+ *	Berkeley, CA 94709-1608		+1-510-843-9542
+ *	USA
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ffs_snapshot.c	8.10 (McKusick) 7/11/00
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/vnode.h>
+
+#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#define KERNCRED proc0.p_ucred
+#define CURPROC curproc
+#define DEBUG
+
+static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
+	int, int, int, int));
+static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *));
+static int readblock __P((struct buf *, daddr_t));
+
+#ifdef DEBUG
+#include <sys/sysctl.h>
+int snapdebug = 0;
+SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
+#endif /* DEBUG */
+
+/*
+ * Create a snapshot file and initialize it for the filesystem.
+ */
+int
+ffs_snapshot(mp, snapfile)
+	struct mount *mp;
+	char *snapfile;
+{
+	ufs_daddr_t rlbn;
+	ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP];
+	int error, cg, snaploc, indiroff, numblks;
+	int i, size, base, len, loc, inoblkcnt;
+	int blksperindir, flag = mp->mnt_flag;
+	struct fs *fs = VFSTOUFS(mp)->um_fs;
+	struct proc *p = CURPROC;
+	struct inode *devip, *ip, *xp;
+	struct buf *bp, *nbp, *ibp;
+	struct vnode *vp, *devvp;
+	struct nameidata nd;
+	struct mount *wrtmp;
+	struct dinode *dip;
+	struct vattr vat;
+	struct cg *cgp;
+
+	/*
+	 * Need to serialize access to snapshot code per filesystem.
+	 */
+	/*
+	 * Assign a snapshot slot in the superblock.
+	 */
+	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
+		if (fs->fs_snapinum[snaploc] == 0)
+			break;
+	if (snaploc == FSMAXSNAP)
+		return (ENOSPC);
+	/*
+	 * Create the snapshot file.
+	 */
+restart:
+	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
+	if ((error = namei(&nd)) != 0)
+		return (error);
+	if (nd.ni_vp != NULL) {
+		vput(nd.ni_vp);
+		error = EEXIST;
+	}
+	if (nd.ni_dvp->v_mount != mp)
+		error = EXDEV;
+	if (error) {
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		if (nd.ni_dvp == nd.ni_vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		return (error);
+	}
+	VATTR_NULL(&vat);
+	vat.va_type = VREG;
+	vat.va_mode = S_IRUSR;
+	vat.va_vaflags |= VA_EXCLUSIVE;
+	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
+		wrtmp = NULL;
+	if (wrtmp != mp)
+		panic("ffs_snapshot: mount mismatch");
+	if (vn_start_write(wrtmp, V_NOWAIT) != 0) {
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		vput(nd.ni_dvp);
+		if ((error = vn_start_write(wrtmp, V_XSLEEP | PCATCH)) != 0)
+			return (error);
+		goto restart;
+	}
+	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
+	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
+	vput(nd.ni_dvp);
+	if (error) {
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		vn_finished_write(wrtmp);
+		return (error);
+	}
+	vp = nd.ni_vp;
+	ip = VTOI(vp);
+	devvp = ip->i_devvp;
+	devip = VTOI(devvp);
+	/*
+	 * Allocate and copy the last block contents so as to be able
+	 * to set size to that of the filesystem.
+	 */
+	numblks = howmany(fs->fs_size, fs->fs_frag);
+	error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
+	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
+	if (error)
+		goto out;
+	ip->i_size = lblktosize(fs, (off_t)numblks);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	if ((error = readblock(bp, numblks - 1)) != 0)
+		goto out;
+	bawrite(bp);
+	/*
+	 * Preallocate critical data structures so that we can copy
+	 * them in without further allocation after we suspend all
+	 * operations on the filesystem. We would like to just release
+	 * the allocated buffers without writing them since they will
+	 * be filled in below once we are ready to go, but this upsets
+	 * the soft update code, so we go ahead and write the new buffers.
+	 *
+	 * Allocate all indirect blocks. Also allocate shadow copies
+	 * for each of the indirect blocks.
+	 */
+	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
+		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
+		if (error)
+			goto out;
+		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
+		bdwrite(ibp);
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
+		    fs->fs_bsize, p->p_ucred, 0, &nbp);
+		if (error)
+			goto out;
+		bawrite(nbp);
+	}
+	/*
+	 * Allocate shadow blocks to copy all of the other snapshot inodes
+	 * so that we will be able to expunge them from this snapshot.
+	 */
+	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
+		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
+		for (i = 0; i < inoblkcnt; i++)
+			if (inoblks[i] == blkno)
+				break;
+		if (i == inoblkcnt) {
+			inoblks[inoblkcnt++] = blkno;
+			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
+			    fs->fs_bsize, KERNCRED, 0, &nbp);
+			if (error)
+				goto out;
+			bawrite(nbp);
+		}
+	}
+	/*
+	 * Allocate all cylinder group blocks.
+	 */
+	for (cg = 0; cg < fs->fs_ncg; cg++) {
+		error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
+		    fs->fs_bsize, KERNCRED, 0, &nbp);
+		if (error)
+			goto out;
+		bawrite(nbp);
+	}
+	/*
+	 * Allocate copies for the superblock and its summary information.
+	 */
+	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
+	    0, &nbp);
+	if (error)
+		goto out;
+	bawrite(nbp);
+	blkno = fragstoblks(fs, fs->fs_csaddr);
+	len = howmany(fs->fs_cssize, fs->fs_bsize);
+	for (loc = 0; loc < len; loc++) {
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
+		    fs->fs_bsize, KERNCRED, 0, &nbp);
+		if (error)
+			goto out;
+		bawrite(nbp);
+	}
+	/*
+	 * Change inode to snapshot type file.
+	 */
+	ip->i_flags |= SF_IMMUTABLE | SF_SNAPSHOT;
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	/*
+	 * Ensure that the snapshot is completely on disk.
+	 */
+	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
+		goto out;
+	/*
+	 * All allocations are done, so we can now snapshot the system.
+	 *
+	 * Suspend operation on filesystem.
+	 */
+	for (;;) {
+		vn_finished_write(wrtmp);
+		vfs_write_suspend(vp->v_mount);
+		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
+			break;
+		vn_start_write(wrtmp, V_WAIT);
+	}
+	/*
+	 * First, copy all the cylinder group maps. All the unallocated
+	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
+	 * it need not copy them if they are later written.
+	 */
+	len = howmany(fs->fs_fpg, fs->fs_frag);
+	for (cg = 0; cg < fs->fs_ncg; cg++) {
+		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+			(int)fs->fs_cgsize, KERNCRED, &bp);
+		if (error) {
+			brelse(bp);
+			goto out1;
+		}
+		cgp = (struct cg *)bp->b_data;
+		if (!cg_chkmagic(cgp)) {
+			brelse(bp);
+			error = EIO;
+			goto out1;
+		}
+		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
+			KERNCRED, &nbp);
+		if (error) {
+			brelse(bp);
+			brelse(nbp);
+			goto out1;
+		}
+		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
+		if (fs->fs_cgsize < fs->fs_bsize)
+			bzero(&nbp->b_data[fs->fs_cgsize],
+			    fs->fs_bsize - fs->fs_cgsize);
+		bawrite(nbp);
+		base = cg * fs->fs_fpg / fs->fs_frag;
+		if (base + len > numblks)
+			len = numblks - base;
+		loc = 0;
+		if (base < NDADDR) {
+			for ( ; loc < NDADDR; loc++) {
+				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
+					continue;
+				ip->i_db[loc] = BLK_NOCOPY;
+			}
+		}
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
+		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+		if (error) {
+			brelse(bp);
+			goto out1;
+		}
+		indiroff = (base + loc - NDADDR) % NINDIR(fs);
+		for ( ; loc < len; loc++, indiroff++) {
+			if (indiroff >= NINDIR(fs)) {
+				bawrite(ibp);
+				error = VOP_BALLOC(vp,
+				    lblktosize(fs, (off_t)(base + loc)),
+				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+				if (error) {
+					brelse(bp);
+					goto out1;
+				}
+				indiroff = 0;
+			}
+			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
+				continue;
+			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
+		}
+		brelse(bp);
+		bdwrite(ibp);
+	}
+	/*
+	 * Snapshot the superblock and its summary information.
+	 */
+	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
+	    0, &nbp);
+	if (error)
+		goto out1;
+	bcopy(fs, nbp->b_data, fs->fs_sbsize);
+	((struct fs *)(nbp->b_data))->fs_clean = 1;
+	if (fs->fs_sbsize < fs->fs_bsize)
+		bzero(&nbp->b_data[fs->fs_sbsize],
+		    fs->fs_bsize - fs->fs_sbsize);
+	bawrite(nbp);
+	blkno = fragstoblks(fs, fs->fs_csaddr);
+	len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
+	size = fs->fs_bsize;
+	for (loc = 0; loc <= len; loc++) {
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
+		    fs->fs_bsize, KERNCRED, 0, &nbp);
+		if (error)
+			goto out1;
+		if (loc == len) {
+			readblock(nbp, blkno + loc);
+			size = fs->fs_cssize % fs->fs_bsize;
+		}
+		bcopy(fs->fs_csp[loc], nbp->b_data, size);
+		bawrite(nbp);
+	}
+	/*
+	 * Copy the shadow blocks for the snapshot inodes so that
+	 * the copies can can be expunged.
+	 */
+	for (loc = 0; loc < inoblkcnt; loc++) {
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]),
+		    fs->fs_bsize, KERNCRED, 0, &nbp);
+		if (error)
+			goto out1;
+		readblock(nbp, inoblks[loc]);
+		bdwrite(nbp);
+	}
+	/*
+	 * Copy allocation information from other snapshots and then
+	 * expunge them from the view of the current snapshot.
+	 */
+	for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) {
+		/*
+		 * Before expunging a snapshot inode, note all the
+		 * blocks that it claims with BLK_SNAP so that fsck will
+		 * be able to account for those blocks properly and so
+		 * that this snapshot knows that it need not copy them
+		 * if the other snapshot holding them is freed.
+		 */
+		if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0)
+			goto out1;
+		blksperindir = 1;
+		lbn = -NDADDR;
+		len = numblks - NDADDR;
+		rlbn = NDADDR;
+		for (i = 0; len > 0 && i < NIADDR; i++) {
+			error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
+			    rlbn, len, blksperindir);
+			if (error)
+				goto out1;
+			blksperindir *= NINDIR(fs);
+			lbn -= blksperindir + 1;
+			len -= blksperindir;
+			rlbn += blksperindir;
+		}
+		/*
+		 * Set copied snapshot inode to be a zero length file.
+		 */
+		blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
+		    fs->fs_bsize, KERNCRED, 0, &nbp);
+		if (error)
+			goto out1;
+		dip = (struct dinode *)nbp->b_data +
+		    ino_to_fsbo(fs, xp->i_number);
+		dip->di_size = 0;
+		dip->di_blocks = 0;
+		dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
+		bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
+		bdwrite(nbp);
+	}
+	/*
+	 * Copy all indirect blocks to their shadows (allocated above)
+	 * to avoid deadlock in ffs_copyonwrite.
+	 */
+	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
+		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
+		if (error)
+			goto out1;
+		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
+		brelse(ibp);
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
+		    fs->fs_bsize, p->p_ucred, 0, &nbp);
+		if (error)
+			goto out1;
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
+		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
+		if (error) {
+			brelse(nbp);
+			goto out1;
+		}
+		bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
+		brelse(ibp);
+		bawrite(nbp);
+	}
+	/*
+	 * Record snapshot inode. Since this is the newest snapshot,
+	 * it must be placed at the end of the list.
+	 */
+	fs->fs_snapinum[snaploc] = ip->i_number;
+	if (ip->i_copyonwrite != 0)
+		panic("ffs_snapshot: %d already on list", ip->i_number);
+	if (devip->i_copyonwrite == 0) {
+		devvp->v_flag |= VCOPYONWRITE;
+		devip->i_copyonwrite = ip;
+	} else {
+		for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; )
+			xp = xp->i_copyonwrite;
+		xp->i_copyonwrite = ip;
+	}
+	vp->v_flag |= VSYSTEM;
+	/*
+	 * Resume operation on filesystem.
+	 */
+out1:
+	vfs_write_resume(vp->v_mount);
+	vn_start_write(wrtmp, V_WAIT);
+out:
+	mp->mnt_flag = flag;
+	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+	if (error)
+		vput(vp);
+	else
+		VOP_UNLOCK(vp, 0, p);
+	vn_finished_write(wrtmp);
+	return (error);
+}
+
+/*
+ * Descend an indirect block chain for vnode cancelvp accounting for all
+ * its indirect blocks in snapvp.
+ */ 
+static int
+indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
+	struct vnode *snapvp;
+	struct vnode *cancelvp;
+	int level;
+	ufs_daddr_t blkno;
+	int lbn;
+	int rlbn;
+	int remblks;
+	int blksperindir;
+{
+	int subblksperindir, error, last, num, i;
+	struct indir indirs[NIADDR + 2];
+	ufs_daddr_t *bap;
+	struct buf *bp;
+	struct fs *fs;
+
+	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
+		return (error);
+	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
+		panic("indiracct: botched params");
+	/*
+	 * We have to expand bread here since it will deadlock looking
+	 * up the block number for any blocks that are not in the cache.
+	 */
+	fs = VTOI(cancelvp)->i_fs;
+	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
+	bp->b_blkno = fsbtodb(fs, blkno);
+	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
+	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
+		brelse(bp);
+		return (error);
+	}
+	/*
+	 * Account for the block pointers in this indirect block.
+	 */
+	last = howmany(remblks, blksperindir);
+	if (last > NINDIR(fs))
+		last = NINDIR(fs);
+	if (snapvp != cancelvp) {
+		bap = (ufs_daddr_t *)bp->b_data;
+	} else {
+		MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
+		bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
+		brelse(bp);
+	}
+	error = snapacct(snapvp, &bap[0], &bap[last]);
+	if (error || level == 0)
+		goto out;
+	/*
+	 * Account for the block pointers in each of the indirect blocks
+	 * in the levels below us.
+	 */
+	subblksperindir = blksperindir / NINDIR(fs);
+	for (lbn++, level--, i = 0; i < last; i++) {
+		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
+		    rlbn, remblks, subblksperindir);
+		if (error)
+			goto out;
+		rlbn += blksperindir;
+		lbn -= blksperindir;
+		remblks -= blksperindir;
+	}
+out:
+	if (snapvp != cancelvp)
+		brelse(bp);
+	else
+		FREE(bap, M_DEVBUF);
+	return (error);
+}
+
+/*
+ * Account for a set of blocks allocated in a snapshot inode.
+ */
+static int
+snapacct(vp, oldblkp, lastblkp)
+	struct vnode *vp;
+	ufs_daddr_t *oldblkp, *lastblkp;
+{
+	struct inode *ip = VTOI(vp);
+	struct fs *fs = ip->i_fs;
+	ufs_daddr_t lbn, blkno, *blkp;
+	struct buf *ibp;
+	int error;
+
+	for ( ; oldblkp < lastblkp; oldblkp++) {
+		blkno = *oldblkp;
+		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
+			continue;
+		lbn = fragstoblks(fs, blkno);
+		if (lbn < NDADDR) {
+			blkp = &ip->i_db[lbn];
+			ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		} else {
+			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
+			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+			if (error)
+				return (error);
+			blkp = &((ufs_daddr_t *)(ibp->b_data))
+			    [(lbn - NDADDR) % NINDIR(fs)];
+		}
+		if (*blkp != 0)
+			panic("snapacct: bad block");
+		*blkp = BLK_SNAP;
+		if (lbn >= NDADDR)
+			bdwrite(ibp);
+	}
+	return (0);
+}
+
+/*
+ * Prepare a snapshot file for being removed.
+ */
+void
+ffs_snapremove(vp)
+	struct vnode *vp;
+{
+	struct inode *ip, *xp;
+	struct vnode *devvp;
+	struct buf *ibp;
+	struct fs *fs;
+	ufs_daddr_t blkno, dblk;
+	int error, snaploc, loc, last;
+
+	ip = VTOI(vp);
+	fs = ip->i_fs;
+	/*
+	 * Delete snapshot inode from superblock. Keep list dense.
+	 */
+	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
+		if (fs->fs_snapinum[snaploc] == ip->i_number)
+			break;
+	if (snaploc < FSMAXSNAP) {
+		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
+			if (fs->fs_snapinum[snaploc] == 0)
+				break;
+			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
+		}
+		fs->fs_snapinum[snaploc - 1] = 0;
+	}
+	/*
+	 * Delete from incore list.
+	 * Clear copy-on-write flag if last snapshot.
+	 */
+	devvp = ip->i_devvp;
+	for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) {
+		if (xp->i_copyonwrite != ip)
+			continue;
+		xp->i_copyonwrite = ip->i_copyonwrite;
+		ip->i_copyonwrite = 0;
+		break;
+	}
+	if (xp == 0) {
+		printf("ffs_snapremove: lost snapshot vnode %d\n",
+		    ip->i_number);
+		vref(vp);
+	}
+	if (VTOI(devvp)->i_copyonwrite == 0)
+		devvp->v_flag &= ~VCOPYONWRITE;
+	/*
+	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
+	 * snapshots that want them (see ffs_snapblkfree below).
+	 */
+	for (blkno = 1; blkno < NDADDR; blkno++) {
+		dblk = ip->i_db[blkno];
+		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
+		    (dblk == blkstofrags(fs, blkno) &&
+		     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
+			ip->i_db[blkno] = 0;
+	}
+	for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) {
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
+		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+		if (error)
+			continue;
+		if ((last = fs->fs_size - blkno) > NINDIR(fs))
+			last = NINDIR(fs);
+		for (loc = 0; loc < last; loc++) {
+			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
+			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
+			    (dblk == blkstofrags(fs, blkno) &&
+			     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
+				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
+		}
+		bawrite(ibp);
+	}
+	/*
+	 * Clear snapshot flag and drop reference.
+	 */
+	ip->i_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	vrele(vp);
+}
+
+/*
+ * Notification that a block is being freed. Return zero if the free
+ * should be allowed to proceed. Return non-zero if the snapshot file
+ * wants to claim the block. The block will be claimed if it is an
+ * uncopied part of one of the snapshots. It will be freed if it is
+ * either a BLK_NOCOPY or has already been copied in all of the snapshots.
+ * If a fragment is being freed, then all snapshots that care about
+ * it must make a copy since a snapshot file can only claim full sized
+ * blocks. Note that if more than one snapshot file maps the block,
+ * we can pick one at random to claim it. Since none of the snapshots
+ * can change, we are assurred that they will all see the same unmodified
+ * image. When deleting a snapshot file (see ffs_snapremove above), we
+ * must push any of these claimed blocks to one of the other snapshots
+ * that maps it. These claimed blocks are easily identified as they will
+ * have a block number equal to their logical block number within the
+ * snapshot. A copied block can never have this property because they
+ * must always have been allocated from a BLK_NOCOPY location.
+ */
+int
+ffs_snapblkfree(freeip, bno, size)
+	struct inode *freeip;
+	ufs_daddr_t bno;
+	long size;
+{
+	struct buf *ibp, *cbp, *savedcbp = 0;
+	struct fs *fs = freeip->i_fs;
+	struct proc *p = CURPROC;
+	struct inode *ip;
+	struct vnode *vp;
+	ufs_daddr_t lbn, blkno;
+	int indiroff = 0, error = 0, claimedblk = 0;
+
+	lbn = fragstoblks(fs, bno);
+	for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip;
+	     ip = ip->i_copyonwrite) {
+		vp = ITOV(ip);
+		/*
+		 * Lookup block being written.
+		 */
+		if (lbn < NDADDR) {
+			blkno = ip->i_db[lbn];
+		} else {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			p->p_flag |= P_COWINPROGRESS;
+			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
+			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+			p->p_flag &= ~P_COWINPROGRESS;
+			VOP_UNLOCK(vp, 0, p);
+			if (error)
+				break;
+			indiroff = (lbn - NDADDR) % NINDIR(fs);
+			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
+		}
+		/*
+		 * Check to see if block needs to be copied.
+		 */
+		switch (blkno) {
+		/*
+		 * If the snapshot has already copied the block (default),
+		 * or does not care about the block, it is not needed.
+		 */
+		default:
+		case BLK_NOCOPY:
+			if (lbn >= NDADDR)
+				brelse(ibp);
+			continue;
+		/*
+		 * No previous snapshot claimed the block, so it will be
+		 * freed and become a BLK_NOCOPY (don't care) for us.
+		 */
+		case BLK_SNAP:
+			if (claimedblk)
+				panic("snapblkfree: inconsistent block type");
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			if (lbn < NDADDR) {
+				ip->i_db[lbn] = BLK_NOCOPY;
+				ip->i_flag |= IN_CHANGE | IN_UPDATE;
+			} else {
+				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
+				    BLK_NOCOPY;
+				bdwrite(ibp);
+			}
+			VOP_UNLOCK(vp, 0, p);
+			continue;
+		/*
+		 * A block that we map is being freed. If it has not been
+		 * claimed yet, we will claim or copy it (below).
+		 */
+		case 0:
+			claimedblk = 1;
+			break;
+		}
+		/*
+		 * If this is a full size block, we will just grab it
+		 * and assign it to the snapshot inode. Otherwise we
+		 * will proceed to copy it. See explanation for this
+		 * routine as to why only a single snapshot needs to
+		 * claim this block.
+		 */
+		if (size == fs->fs_bsize) {
+#ifdef DEBUG
+			if (snapdebug)
+				printf("%s %d lbn %d from inum %d\n",
+				    "Grabonremove: snapino", ip->i_number, lbn,
+				    freeip->i_number);
+#endif
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			if (lbn < NDADDR) {
+				ip->i_db[lbn] = bno;
+			} else {
+				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
+				bdwrite(ibp);
+			}
+			ip->i_blocks += btodb(size);
+			ip->i_flag |= IN_CHANGE | IN_UPDATE;
+			VOP_UNLOCK(vp, 0, p);
+			return (1);
+		}
+		if (lbn >= NDADDR)
+			brelse(ibp);
+		/*
+		 * Allocate the block into which to do the copy. Note that this
+		 * allocation will never require any additional allocations for
+		 * the snapshot inode.
+		 */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+		p->p_flag |= P_COWINPROGRESS;
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
+		    fs->fs_bsize, KERNCRED, 0, &cbp);
+		p->p_flag &= ~P_COWINPROGRESS;
+		VOP_UNLOCK(vp, 0, p);
+		if (error)
+			break;
+#ifdef DEBUG
+		if (snapdebug)
+			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
+			    "Copyonremove: snapino ", ip->i_number, lbn,
+			    freeip->i_number, size, cbp->b_blkno);
+#endif
+		/*
+		 * If we have already read the old block contents, then
+		 * simply copy them to the new block.
+		 */
+		if (savedcbp != 0) {
+			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			continue;
+		}
+		/*
+		 * Otherwise, read the old block contents into the buffer.
+		 */
+		if ((error = readblock(cbp, lbn)) != 0)
+			break;
+		savedcbp = cbp;
+	}
+	if (savedcbp)
+		bawrite(savedcbp);
+	/*
+	 * If we have been unable to allocate a block in which to do
+	 * the copy, then return non-zero so that the fragment will
+	 * not be freed. Although space will be lost, the snapshot
+	 * will stay consistent.
+	 */
+	return (error);
+}
+
+/*
+ * Associate snapshot files when mounting.
+ */
+void
+ffs_snapshot_mount(mp)
+	struct mount *mp;
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	struct proc *p = CURPROC;
+	struct inode *ip, **listtailp;
+	struct vnode *vp;
+	int error, snaploc, loc;
+
+	listtailp = &VTOI(ump->um_devvp)->i_copyonwrite;
+	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
+		if (fs->fs_snapinum[snaploc] == 0)
+			return;
+		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){
+			printf("ffs_snapshot_mount: vget failed %d\n", error);
+			continue;
+		}
+		ip = VTOI(vp);
+		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
+			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
+			    fs->fs_snapinum[snaploc]);
+			vput(vp);
+			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
+				if (fs->fs_snapinum[loc] == 0)
+					break;
+				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
+			}
+			fs->fs_snapinum[loc - 1] = 0;
+			snaploc--;
+			continue;
+		}
+		if (ip->i_copyonwrite != 0)
+			panic("ffs_snapshot_mount: %d already on list",
+			    ip->i_number);
+		*listtailp = ip;
+		listtailp = &ip->i_copyonwrite;
+		vp->v_flag |= VSYSTEM;
+		VOP_UNLOCK(vp, 0, p);
+		ump->um_devvp->v_flag |= VCOPYONWRITE;
+	}
+}
+
+/*
+ * Disassociate snapshot files when unmounting.
+ */
+void
+ffs_snapshot_unmount(mp)
+	struct mount *mp;
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct inode *devip = VTOI(ump->um_devvp);
+	struct inode *xp;
+
+	while ((xp = devip->i_copyonwrite) != 0) {
+		devip->i_copyonwrite = xp->i_copyonwrite;
+		xp->i_copyonwrite = 0;
+		vrele(ITOV(xp));
+	}
+	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
+}
+
+/*
+ * Check for need to copy block that is about to be written,
+ * copying the block if necessary.
+ */
+int
+ffs_copyonwrite(ap)
+	struct vop_copyonwrite_args /* {
+		struct vnode *a_vp;
+		struct buf *a_bp;
+	} */ *ap;
+{
+	struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp;
+	struct fs *fs = VTOI(bp->b_vp)->i_fs;
+	struct proc *p = CURPROC;
+	struct inode *ip;
+	struct vnode *vp;
+	ufs_daddr_t lbn, blkno;
+	int indiroff, error = 0;
+
+	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
+	if (p->p_flag & P_COWINPROGRESS)
+		panic("ffs_copyonwrite: recursive call");
+	for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) {
+		vp = ITOV(ip);
+		/*
+		 * We ensure that everything of our own that needs to be
+		 * copied will be done at the time that ffs_snapshot is
+		 * called. Thus we can skip the check here which can
+		 * deadlock in doing the lookup in VOP_BALLOC.
+		 */
+		if (bp->b_vp == vp)
+			continue;
+		/*
+		 * Check to see if block needs to be copied.
+		 */
+		if (lbn < NDADDR) {
+			blkno = ip->i_db[lbn];
+		} else {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			p->p_flag |= P_COWINPROGRESS;
+			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
+			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+			p->p_flag &= ~P_COWINPROGRESS;
+			VOP_UNLOCK(vp, 0, p);
+			if (error)
+				break;
+			indiroff = (lbn - NDADDR) % NINDIR(fs);
+			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
+			brelse(ibp);
+		}
+#ifdef DIAGNOSTIC
+		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
+			panic("ffs_copyonwrite: bad copy block");
+#endif
+		if (blkno != 0)
+			continue;
+		/*
+		 * Allocate the block into which to do the copy. Note that this
+		 * allocation will never require any additional allocations for
+		 * the snapshot inode.
+		 */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+		p->p_flag |= P_COWINPROGRESS;
+		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
+		    fs->fs_bsize, KERNCRED, 0, &cbp);
+		p->p_flag &= ~P_COWINPROGRESS;
+		VOP_UNLOCK(vp, 0, p);
+#ifdef DEBUG
+		if (snapdebug) {
+			printf("Copyonwrite: snapino %d lbn %d for ",
+			    ip->i_number, lbn);
+			if (bp->b_vp == ap->a_vp)
+				printf("fs metadata");
+			else
+				printf("inum %d", VTOI(bp->b_vp)->i_number);
+			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
+			    cbp->b_blkno);
+		}
+#endif
+		if (error)
+			break;
+		/*
+		 * If we have already read the old block contents, then
+		 * simply copy them to the new block.
+		 */
+		if (savedcbp != 0) {
+			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			continue;
+		}
+		/*
+		 * Otherwise, read the old block contents into the buffer.
+		 */
+		if ((error = readblock(cbp, lbn)) != 0)
+			break;
+		savedcbp = cbp;
+	}
+	if (savedcbp)
+		bawrite(savedcbp);
+	return (error);
+}
+
+/*
+ * Read the specified block into the given buffer.
+ * Much of this boiler-plate comes from bwrite().
+ */
+static int
+readblock(bp, lbn)
+	struct buf *bp;
+	daddr_t lbn;
+{
+	struct uio auio;
+	struct iovec aiov;
+	struct proc *p = CURPROC;
+	struct inode *ip = VTOI(bp->b_vp);
+
+	aiov.iov_base = bp->b_data;
+	aiov.iov_len = bp->b_bcount;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
+	auio.uio_resid = bp->b_bcount;
+	auio.uio_rw = UIO_READ;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_procp = p;
+	return (physio(ip->i_devvp->v_rdev, &auio, 0));
+}
author	mckusick <mckusick@FreeBSD.org>	2000-07-11 22:07:57 +0000
committer	mckusick <mckusick@FreeBSD.org>	2000-07-11 22:07:57 +0000
commit	a3d0c189ea25a7af3dfab30112f5d8d65e214e1c (patch)
tree	c84458dcf49aaf90ff010ebc108cb3b6ca3c2f4a /sys/ufs/ffs/ffs_snapshot.c
parent	c8c04452402a28eabd1ed8a1a06e0a14ac3d22c6 (diff)
download	FreeBSD-src-a3d0c189ea25a7af3dfab30112f5d8d65e214e1c.zip FreeBSD-src-a3d0c189ea25a7af3dfab30112f5d8d65e214e1c.tar.gz