summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormckusick <mckusick@FreeBSD.org>2000-07-24 05:28:33 +0000
committermckusick <mckusick@FreeBSD.org>2000-07-24 05:28:33 +0000
commitacc66855bf5786e46e7a1f2c9805ca96cc90c681 (patch)
tree293bc9453d98bf984dd4fb4392b47ad92614d53f
parent35aeef29b58a4f4acf20b4a13f1326e85affab9e (diff)
downloadFreeBSD-src-acc66855bf5786e46e7a1f2c9805ca96cc90c681.zip
FreeBSD-src-acc66855bf5786e46e7a1f2c9805ca96cc90c681.tar.gz
This patch corrects the first round of panics and hangs reported
with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Clean up a change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zeroed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occasionally read them, which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. 
This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
-rw-r--r--sys/fs/cd9660/cd9660_vfsops.c3
-rw-r--r--sys/fs/specfs/spec_vnops.c6
-rw-r--r--sys/gnu/ext2fs/ext2_bmap.c24
-rw-r--r--sys/gnu/fs/ext2fs/ext2_bmap.c24
-rw-r--r--sys/isofs/cd9660/cd9660_vfsops.c3
-rw-r--r--sys/kern/vfs_export.c36
-rw-r--r--sys/kern/vfs_subr.c36
-rw-r--r--sys/kern/vfs_vnops.c14
-rw-r--r--sys/miscfs/specfs/spec_vnops.c6
-rw-r--r--sys/nfs/nfs_common.c3
-rw-r--r--sys/nfs/nfs_subs.c3
-rw-r--r--sys/nfsclient/nfs_subs.c3
-rw-r--r--sys/nfsserver/nfs_srvsubs.c3
-rw-r--r--sys/sys/buf.h5
-rw-r--r--sys/sys/vnode.h5
-rw-r--r--sys/ufs/ffs/ffs_extern.h1
-rw-r--r--sys/ufs/ffs/ffs_snapshot.c67
-rw-r--r--sys/ufs/ffs/ffs_softdep.c91
-rw-r--r--sys/ufs/ffs/ffs_vfsops.c31
-rw-r--r--sys/ufs/mfs/mfs_vfsops.c2
-rw-r--r--sys/ufs/ufs/ufs_bmap.c24
-rw-r--r--sys/ufs/ufs/ufs_inode.c6
-rw-r--r--sys/ufs/ufs/ufs_quota.c2
-rw-r--r--sys/ufs/ufs/ufs_vnops.c3
24 files changed, 298 insertions, 103 deletions
diff --git a/sys/fs/cd9660/cd9660_vfsops.c b/sys/fs/cd9660/cd9660_vfsops.c
index 2b21ba9..50f3883 100644
--- a/sys/fs/cd9660/cd9660_vfsops.c
+++ b/sys/fs/cd9660/cd9660_vfsops.c
@@ -855,7 +855,8 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir)
case VCHR:
case VBLK:
vp->v_op = cd9660_specop_p;
- addaliasu(vp, ip->inode.iso_rdev);
+ vp = addaliasu(vp, ip->inode.iso_rdev);
+ ip->i_vnode = vp;
break;
default:
break;
diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c
index baf40c3..2b1df9e 100644
--- a/sys/fs/specfs/spec_vnops.c
+++ b/sys/fs/specfs/spec_vnops.c
@@ -421,9 +421,11 @@ spec_strategy(ap)
bp = ap->a_bp;
vp = ap->a_vp;
if ((bp->b_iocmd == BIO_WRITE)) {
- if (vp->v_mount != NULL &&
- (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
+ if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
+ bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
+ (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("spec_strategy: bad I/O");
+ bp->b_flags &= ~B_VALIDSUSPWRT;
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
if ((vp->v_flag & VCOPYONWRITE) &&
diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c
index ab4ac52..40fdd65 100644
--- a/sys/gnu/ext2fs/ext2_bmap.c
+++ b/sys/gnu/ext2fs/ext2_bmap.c
@@ -147,7 +147,18 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, ip->i_db[bn]);
- if (*bnp == 0) {
+ /*
+ * Since this is FFS independent code, we are out of
+ * scope for the definitions of BLK_NOCOPY and
+ * BLK_SNAP, but we do know that they will fall in
+ * the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts
+ * are made to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & SF_SNAPSHOT) &&
+ ip->i_db[bn] > 0 && ip->i_db[bn] < ump->um_seqinc) {
+ *bnp = -1;
+ } else if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
else
@@ -230,6 +241,17 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
if (bp)
bqrelse(bp);
+ /*
+ * Since this is FFS independent code, we are out of scope for the
+ * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
+ * will fall in the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts are made
+ * to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
+ *bnp = -1;
+ return (0);
+ }
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c
index ab4ac52..40fdd65 100644
--- a/sys/gnu/fs/ext2fs/ext2_bmap.c
+++ b/sys/gnu/fs/ext2fs/ext2_bmap.c
@@ -147,7 +147,18 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, ip->i_db[bn]);
- if (*bnp == 0) {
+ /*
+ * Since this is FFS independent code, we are out of
+ * scope for the definitions of BLK_NOCOPY and
+ * BLK_SNAP, but we do know that they will fall in
+ * the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts
+ * are made to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & SF_SNAPSHOT) &&
+ ip->i_db[bn] > 0 && ip->i_db[bn] < ump->um_seqinc) {
+ *bnp = -1;
+ } else if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
else
@@ -230,6 +241,17 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
if (bp)
bqrelse(bp);
+ /*
+ * Since this is FFS independent code, we are out of scope for the
+ * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
+ * will fall in the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts are made
+ * to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
+ *bnp = -1;
+ return (0);
+ }
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
diff --git a/sys/isofs/cd9660/cd9660_vfsops.c b/sys/isofs/cd9660/cd9660_vfsops.c
index 2b21ba9..50f3883 100644
--- a/sys/isofs/cd9660/cd9660_vfsops.c
+++ b/sys/isofs/cd9660/cd9660_vfsops.c
@@ -855,7 +855,8 @@ cd9660_vget_internal(mp, ino, vpp, relocated, isodir)
case VCHR:
case VBLK:
vp->v_op = cd9660_specop_p;
- addaliasu(vp, ip->inode.iso_rdev);
+ vp = addaliasu(vp, ip->inode.iso_rdev);
+ ip->i_vnode = vp;
break;
default:
break;
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 0e5ec3f..db16d9f 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -1296,15 +1296,45 @@ bdevvp(dev, vpp)
* how many users there are is inadequate; the v_usecount for
* the vnodes need to be accumulated. vcount() does that.
*/
-void
+struct vnode *
addaliasu(nvp, nvp_rdev)
struct vnode *nvp;
udev_t nvp_rdev;
{
+ struct vnode *ovp;
+ vop_t **ops;
+ dev_t dev;
if (nvp->v_type != VBLK && nvp->v_type != VCHR)
panic("addaliasu on non-special vnode");
- addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
+ dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
+ /*
+ * Check to see if we have a bdevvp vnode with no associated
+ * filesystem. If so, we want to associate the filesystem of
+ * the newly instigated vnode with the bdevvp vnode and
+ * discard the newly created vnode rather than leaving the
+ * bdevvp vnode lying around with no associated filesystem.
+ */
+ if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
+ addalias(nvp, dev);
+ return (nvp);
+ }
+ /*
+ * Discard unneeded vnode, but save its node specific data.
+ * Note that if there is a lock, it is carried over in the
+ * node specific data to the replacement vnode.
+ */
+ vref(ovp);
+ ovp->v_data = nvp->v_data;
+ ovp->v_tag = nvp->v_tag;
+ nvp->v_data = NULL;
+ ops = nvp->v_op;
+ nvp->v_op = ovp->v_op;
+ ovp->v_op = ops;
+ insmntque(ovp, nvp->v_mount);
+ vrele(nvp);
+ vgone(nvp);
+ return (ovp);
}
void
@@ -1648,7 +1678,7 @@ vclean(vp, flags, p)
*/
if (flags & DOCLOSE) {
if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
- (void) vn_write_suspend_wait(vp, V_WAIT);
+ (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
vinvalbuf(vp, 0, NOCRED, p, 0, 0);
}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 0e5ec3f..db16d9f 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1296,15 +1296,45 @@ bdevvp(dev, vpp)
* how many users there are is inadequate; the v_usecount for
* the vnodes need to be accumulated. vcount() does that.
*/
-void
+struct vnode *
addaliasu(nvp, nvp_rdev)
struct vnode *nvp;
udev_t nvp_rdev;
{
+ struct vnode *ovp;
+ vop_t **ops;
+ dev_t dev;
if (nvp->v_type != VBLK && nvp->v_type != VCHR)
panic("addaliasu on non-special vnode");
- addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
+ dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
+ /*
+ * Check to see if we have a bdevvp vnode with no associated
+ * filesystem. If so, we want to associate the filesystem of
+ * the newly instigated vnode with the bdevvp vnode and
+ * discard the newly created vnode rather than leaving the
+ * bdevvp vnode lying around with no associated filesystem.
+ */
+ if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
+ addalias(nvp, dev);
+ return (nvp);
+ }
+ /*
+ * Discard unneeded vnode, but save its node specific data.
+ * Note that if there is a lock, it is carried over in the
+ * node specific data to the replacement vnode.
+ */
+ vref(ovp);
+ ovp->v_data = nvp->v_data;
+ ovp->v_tag = nvp->v_tag;
+ nvp->v_data = NULL;
+ ops = nvp->v_op;
+ nvp->v_op = ovp->v_op;
+ ovp->v_op = ops;
+ insmntque(ovp, nvp->v_mount);
+ vrele(nvp);
+ vgone(nvp);
+ return (ovp);
}
void
@@ -1648,7 +1678,7 @@ vclean(vp, flags, p)
*/
if (flags & DOCLOSE) {
if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
- (void) vn_write_suspend_wait(vp, V_WAIT);
+ (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
vinvalbuf(vp, 0, NOCRED, p, 0, 0);
}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 0708f7c..0c4707b 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -729,17 +729,19 @@ vn_start_write(vp, mpp, flags)
* time, these operations are halted until the suspension is over.
*/
int
-vn_write_suspend_wait(vp, flags)
+vn_write_suspend_wait(vp, mp, flags)
struct vnode *vp;
+ struct mount *mp;
int flags;
{
- struct mount *mp;
int error;
- if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
- if (error != EOPNOTSUPP)
- return (error);
- return (0);
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
}
/*
* If we are not suspended or have not yet reached suspended
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index baf40c3..2b1df9e 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -421,9 +421,11 @@ spec_strategy(ap)
bp = ap->a_bp;
vp = ap->a_vp;
if ((bp->b_iocmd == BIO_WRITE)) {
- if (vp->v_mount != NULL &&
- (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
+ if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
+ bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
+ (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("spec_strategy: bad I/O");
+ bp->b_flags &= ~B_VALIDSUSPWRT;
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
if ((vp->v_flag & VCOPYONWRITE) &&
diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c
index 70e871f..5934465 100644
--- a/sys/nfs/nfs_common.c
+++ b/sys/nfs/nfs_common.c
@@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
- addaliasu(vp, rdev);
+ vp = addaliasu(vp, rdev);
+ np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}
diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c
index 70e871f..5934465 100644
--- a/sys/nfs/nfs_subs.c
+++ b/sys/nfs/nfs_subs.c
@@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
- addaliasu(vp, rdev);
+ vp = addaliasu(vp, rdev);
+ np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}
diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c
index 70e871f..5934465 100644
--- a/sys/nfsclient/nfs_subs.c
+++ b/sys/nfsclient/nfs_subs.c
@@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
- addaliasu(vp, rdev);
+ vp = addaliasu(vp, rdev);
+ np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}
diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c
index 70e871f..5934465 100644
--- a/sys/nfsserver/nfs_srvsubs.c
+++ b/sys/nfsserver/nfs_srvsubs.c
@@ -1282,7 +1282,8 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper)
}
if (vp->v_type == VCHR || vp->v_type == VBLK) {
vp->v_op = spec_nfsv2nodeop_p;
- addaliasu(vp, rdev);
+ vp = addaliasu(vp, rdev);
+ np->n_vnode = vp;
}
np->n_mtime = mtime.tv_sec;
}
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 116e011..4cb2bba 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -183,7 +183,7 @@ struct buf {
#define B_UNUSED0 0x00000008 /* Old B_BAD */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
-#define B_UNUSED40 0x00000040 /* Old B_CALL */
+#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
@@ -237,7 +237,7 @@ extern char *buf_wmesg; /* Default buffer lock message */
* Initialize a lock.
*/
#define BUF_LOCKINIT(bp) \
- lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0)
+ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_CANRECURSE)
/*
*
* Get a lock sleeping non-interruptably until it becomes available.
@@ -467,6 +467,7 @@ buf_countdeps(struct buf *bp, int i)
#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */
#define B_SYNC 0x02 /* Do all allocations synchronously. */
#define B_METAONLY 0x04 /* Return indirect block buffer. */
+#define B_NOWAIT 0x08 /* do not sleep to await lock */
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 3da7897..819681c 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -537,7 +537,7 @@ struct vop_bwrite_args;
extern int (*lease_check_hook) __P((struct vop_lease_args *));
void addalias __P((struct vnode *vp, dev_t nvp_rdev));
-void addaliasu __P((struct vnode *vp, udev_t nvp_rdev));
+struct vnode *addaliasu __P((struct vnode *vp, udev_t nvp_rdev));
int bdevvp __P((dev_t dev, struct vnode **vpp));
/* cache_* may belong in namei.h. */
void cache_enter __P((struct vnode *dvp, struct vnode *vp,
@@ -593,7 +593,8 @@ int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p));
int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags));
dev_t vn_todev __P((struct vnode *vp));
-int vn_write_suspend_wait __P((struct vnode *vp, int flags));
+int vn_write_suspend_wait __P((struct vnode *vp, struct mount *mp,
+ int flags));
int vn_writechk __P((struct vnode *vp));
int vfs_cache_lookup __P((struct vop_lookup_args *ap));
int vfs_object_create __P((struct vnode *vp, struct proc *p,
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 8e011bb..1d52ec7 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -116,6 +116,7 @@ extern vop_t **ffs_fifoop_p;
void softdep_initialize __P((void));
int softdep_mount __P((struct vnode *, struct mount *, struct fs *,
struct ucred *));
+int softdep_flushworklist __P((struct mount *, int *, struct proc *));
int softdep_flushfiles __P((struct mount *, int, struct proc *));
void softdep_update_inodeblock __P((struct inode *, struct buf *, int));
void softdep_load_inodeblock __P((struct inode *));
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index d749abe..af03143 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)ffs_snapshot.c 8.10 (McKusick) 7/11/00
+ * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
* $FreeBSD$
*/
@@ -290,6 +290,7 @@ restart:
if (fs->fs_cgsize < fs->fs_bsize)
bzero(&nbp->b_data[fs->fs_cgsize],
fs->fs_bsize - fs->fs_cgsize);
+ nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
base = cg * fs->fs_fpg / fs->fs_frag;
if (base + len > numblks)
@@ -311,6 +312,7 @@ restart:
indiroff = (base + loc - NDADDR) % NINDIR(fs);
for ( ; loc < len; loc++, indiroff++) {
if (indiroff >= NINDIR(fs)) {
+ ibp->b_flags |= B_VALIDSUSPWRT;
bawrite(ibp);
error = VOP_BALLOC(vp,
lblktosize(fs, (off_t)(base + loc)),
@@ -325,7 +327,8 @@ restart:
continue;
((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
}
- brelse(bp);
+ bqrelse(bp);
+ ibp->b_flags |= B_VALIDSUSPWRT;
bdwrite(ibp);
}
/*
@@ -340,6 +343,7 @@ restart:
if (fs->fs_sbsize < fs->fs_bsize)
bzero(&nbp->b_data[fs->fs_sbsize],
fs->fs_bsize - fs->fs_sbsize);
+ nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
blkno = fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
@@ -354,6 +358,7 @@ restart:
size = fs->fs_cssize % fs->fs_bsize;
}
bcopy(fs->fs_csp[loc], nbp->b_data, size);
+ nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
}
/*
@@ -366,6 +371,7 @@ restart:
if (error)
goto out1;
readblock(nbp, inoblks[loc]);
+ nbp->b_flags |= B_VALIDSUSPWRT;
bdwrite(nbp);
}
/*
@@ -410,6 +416,7 @@ restart:
dip->di_blocks = 0;
dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
+ nbp->b_flags |= B_VALIDSUSPWRT;
bdwrite(nbp);
}
/*
@@ -422,7 +429,7 @@ restart:
if (error)
goto out1;
copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
- brelse(ibp);
+ bqrelse(ibp);
error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
fs->fs_bsize, p->p_ucred, 0, &nbp);
if (error)
@@ -434,7 +441,8 @@ restart:
goto out1;
}
bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
- brelse(ibp);
+ bqrelse(ibp);
+ nbp->b_flags |= B_VALIDSUSPWRT;
bawrite(nbp);
}
/*
@@ -518,7 +526,7 @@ indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
} else {
MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
- brelse(bp);
+ bqrelse(bp);
}
error = snapacct(snapvp, &bap[0], &bap[last]);
if (error || level == 0)
@@ -539,7 +547,7 @@ indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
}
out:
if (snapvp != cancelvp)
- brelse(bp);
+ bqrelse(bp);
else
FREE(bap, M_DEVBUF);
return (error);
@@ -578,8 +586,10 @@ snapacct(vp, oldblkp, lastblkp)
if (*blkp != 0)
panic("snapacct: bad block");
*blkp = BLK_SNAP;
- if (lbn >= NDADDR)
+ if (lbn >= NDADDR) {
+ ibp->b_flags |= B_VALIDSUSPWRT;
bdwrite(ibp);
+ }
}
return (0);
}
@@ -732,7 +742,7 @@ ffs_snapblkfree(freeip, bno, size)
default:
case BLK_NOCOPY:
if (lbn >= NDADDR)
- brelse(ibp);
+ bqrelse(ibp);
continue;
/*
* No previous snapshot claimed the block, so it will be
@@ -787,7 +797,7 @@ ffs_snapblkfree(freeip, bno, size)
return (1);
}
if (lbn >= NDADDR)
- brelse(ibp);
+ bqrelse(ibp);
/*
* Allocate the block into which to do the copy. Note that this
* allocation will never require any additional allocations for
@@ -933,40 +943,57 @@ ffs_copyonwrite(ap)
if (bp->b_vp == vp)
continue;
/*
- * Check to see if block needs to be copied.
+ * Check to see if block needs to be copied. We have to
+ * be able to do the VOP_BALLOC without blocking, otherwise
+ * we may get in a deadlock with another process also
+ * trying to allocate. If we find ourselves unable to
+ * get the buffer lock, we unlock the snapshot vnode,
+ * sleep briefly, and try again.
*/
+retry:
+ vn_lock(vp, LK_SHARED | LK_RETRY, p);
if (lbn < NDADDR) {
blkno = ip->i_db[lbn];
} else {
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
p->p_flag |= P_COWINPROGRESS;
error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
- fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
+ fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
p->p_flag &= ~P_COWINPROGRESS;
- VOP_UNLOCK(vp, 0, p);
- if (error)
- break;
+ if (error) {
+ VOP_UNLOCK(vp, 0, p);
+ if (error != EWOULDBLOCK)
+ break;
+ tsleep(vp, p->p_usrpri, "nap", 1);
+ goto retry;
+ }
indiroff = (lbn - NDADDR) % NINDIR(fs);
blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
- brelse(ibp);
+ bqrelse(ibp);
}
#ifdef DIAGNOSTIC
if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
panic("ffs_copyonwrite: bad copy block");
#endif
- if (blkno != 0)
+ if (blkno != 0) {
+ VOP_UNLOCK(vp, 0, p);
continue;
+ }
/*
* Allocate the block into which to do the copy. Note that this
* allocation will never require any additional allocations for
* the snapshot inode.
*/
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
p->p_flag |= P_COWINPROGRESS;
error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
- fs->fs_bsize, KERNCRED, 0, &cbp);
+ fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
p->p_flag &= ~P_COWINPROGRESS;
VOP_UNLOCK(vp, 0, p);
+ if (error) {
+ if (error != EWOULDBLOCK)
+ break;
+ tsleep(vp, p->p_usrpri, "nap", 1);
+ goto retry;
+ }
#ifdef DEBUG
if (snapdebug) {
printf("Copyonwrite: snapino %d lbn %d for ",
@@ -979,8 +1006,6 @@ ffs_copyonwrite(ap)
cbp->b_blkno);
}
#endif
- if (error)
- break;
/*
* If we have already read the old block contents, then
* simply copy them to the new block.
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index d9e6414..cbc37ad 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -548,41 +548,45 @@ softdep_process_worklist(matchmnt)
case D_DIRREM:
/* removal of a directory entry */
mp = WK_DIRREM(wk)->dm_mnt;
+ if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
+ panic("%s: dirrem on suspended filesystem",
+ "softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
- vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_remove(WK_DIRREM(wk));
- vn_finished_write(mp);
break;
case D_FREEBLKS:
/* releasing blocks and/or fragments from a file */
mp = WK_FREEBLKS(wk)->fb_mnt;
+ if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
+ panic("%s: freeblks on suspended filesystem",
+ "softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
- vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_freeblocks(WK_FREEBLKS(wk));
- vn_finished_write(mp);
break;
case D_FREEFRAG:
/* releasing a fragment when replaced as a file grows */
mp = WK_FREEFRAG(wk)->ff_mnt;
+ if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
+ panic("%s: freefrag on suspended filesystem",
+ "softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
- vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_freefrag(WK_FREEFRAG(wk));
- vn_finished_write(mp);
break;
case D_FREEFILE:
/* releasing an inode when its link count drops to 0 */
mp = WK_FREEFILE(wk)->fx_mnt;
+ if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
+ panic("%s: freefile on suspended filesystem",
+ "softdep_process_worklist");
if (mp == matchmnt)
matchcnt += 1;
- vn_start_write(NULL, &mp, V_WAIT);
handle_workitem_freefile(WK_FREEFILE(wk));
- vn_finished_write(mp);
break;
default:
@@ -646,13 +650,13 @@ softdep_move_dependencies(oldbp, newbp)
* Purge the work list of all items associated with a particular mount point.
*/
int
-softdep_flushfiles(oldmnt, flags, p)
+softdep_flushworklist(oldmnt, countp, p)
struct mount *oldmnt;
- int flags;
+ int *countp;
struct proc *p;
{
struct vnode *devvp;
- int error, loopcnt;
+ int count, error = 0;
/*
* Await our turn to clear out the queue.
@@ -660,32 +664,16 @@ softdep_flushfiles(oldmnt, flags, p)
while (softdep_worklist_busy)
tsleep(&lbolt, PRIBIO, "softflush", 0);
softdep_worklist_busy = 1;
- if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
- softdep_worklist_busy = 0;
- return (error);
- }
/*
* Alternately flush the block device associated with the mount
* point and process any dependencies that the flushing
- * creates. In theory, this loop can happen at most twice,
- * but we give it a few extra just to be sure.
+ * creates. We continue until no more worklist dependencies
+ * are found.
*/
+ *countp = 0;
devvp = VFSTOUFS(oldmnt)->um_devvp;
- for (loopcnt = 10; loopcnt > 0; ) {
- if (softdep_process_worklist(oldmnt) == 0) {
- loopcnt--;
- /*
- * Do another flush in case any vnodes were brought in
- * as part of the cleanup operations.
- */
- if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
- break;
- /*
- * If we still found nothing to do, we are really done.
- */
- if (softdep_process_worklist(oldmnt) == 0)
- break;
- }
+ while ((count = softdep_process_worklist(oldmnt)) > 0) {
+ *countp += count;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
VOP_UNLOCK(devvp, 0, p);
@@ -693,6 +681,37 @@ softdep_flushfiles(oldmnt, flags, p)
break;
}
softdep_worklist_busy = 0;
+ return (error);
+}
+
+/*
+ * Flush all vnodes and worklist items associated with a specified mount point.
+ */
+int
+softdep_flushfiles(oldmnt, flags, p)
+ struct mount *oldmnt;
+ int flags;
+ struct proc *p;
+{
+ int error, count, loopcnt;
+
+ /*
+ * Alternately flush the vnodes associated with the mount
+ * point and process any dependencies that the flushing
+ * creates. In theory, this loop can happen at most twice,
+ * but we give it a few extra just to be sure.
+ */
+ for (loopcnt = 10; loopcnt > 0; loopcnt--) {
+ /*
+ * Do another flush in case any vnodes were brought in
+ * as part of the cleanup operations.
+ */
+ if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
+ break;
+ if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
+ count == 0)
+ break;
+ }
/*
* If we are unmounting then it is an error to fail. If we
* are simply trying to downgrade to read-only, then filesystem
@@ -4432,8 +4451,8 @@ clear_remove(p)
mp = pagedep->pd_mnt;
ino = pagedep->pd_ino;
FREE_LOCK(&lk);
- if (vn_start_write(NULL, &mp, V_WAIT | PCATCH) != 0)
- return;
+ if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
+ continue;
if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
softdep_error("clear_remove: vget", error);
vn_finished_write(mp);
@@ -4503,8 +4522,8 @@ clear_inodedeps(p)
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
continue;
FREE_LOCK(&lk);
- if (vn_start_write(NULL, &mp, V_WAIT | PCATCH) != 0)
- return;
+ if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
+ continue;
if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
softdep_error("clear_inodedeps: vget", error);
vn_finished_write(mp);
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index cf0e220..c40be45 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -908,7 +908,7 @@ ffs_sync(mp, waitfor, cred, p)
struct inode *ip;
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs;
- int error, allerror = 0;
+ int error, count, wait, lockreq, allerror = 0;
fs = ump->um_fs;
if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */
@@ -918,6 +918,12 @@ ffs_sync(mp, waitfor, cred, p)
/*
* Write back each (modified) inode.
*/
+ wait = 0;
+ lockreq = LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK;
+ if (waitfor == MNT_WAIT) {
+ wait = 1;
+ lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
+ }
simple_lock(&mntvnode_slock);
loop:
for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
@@ -938,9 +944,7 @@ loop:
}
if (vp->v_type != VCHR) {
simple_unlock(&mntvnode_slock);
- error =
- vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p);
- if (error) {
+ if ((error = vget(vp, lockreq, p)) != 0) {
simple_lock(&mntvnode_slock);
if (error == ENOENT)
goto loop;
@@ -948,14 +952,12 @@ loop:
}
if ((error = VOP_FSYNC(vp, cred, waitfor, p)) != 0)
allerror = error;
- VOP_UNLOCK(vp, 0, p);
- vrele(vp);
+ vput(vp);
simple_lock(&mntvnode_slock);
} else {
simple_unlock(&mntvnode_slock);
simple_unlock(&vp->v_interlock);
- /* UFS_UPDATE(vp, waitfor == MNT_WAIT); */
- UFS_UPDATE(vp, 0);
+ UFS_UPDATE(vp, wait);
simple_lock(&mntvnode_slock);
}
}
@@ -963,9 +965,16 @@ loop:
/*
* Force stale file system control information to be flushed.
*/
- if (waitfor != MNT_LAZY) {
- if (ump->um_mountp->mnt_flag & MNT_SOFTDEP)
- waitfor = MNT_NOWAIT;
+ if (waitfor == MNT_WAIT) {
+ if ((error = softdep_flushworklist(ump->um_mountp, &count, p)))
+ allerror = error;
+ /* Flushed work items may create new vnodes to clean */
+ if (count) {
+ simple_lock(&mntvnode_slock);
+ goto loop;
+ }
+ }
+ if (waitfor == MNT_NOWAIT) {
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0)
allerror = error;
diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c
index 5c5e010..429f029 100644
--- a/sys/ufs/mfs/mfs_vfsops.c
+++ b/sys/ufs/mfs/mfs_vfsops.c
@@ -248,7 +248,7 @@ mfs_mount(mp, path, data, ndp, p)
/* It is not clear that these will get initialized otherwise */
dev->si_bsize_phys = DEV_BSIZE;
dev->si_iosize_max = DFLTPHYS;
- addaliasu(devvp, makeudev(253, mfs_minor++));
+ devvp = addaliasu(devvp, makeudev(253, mfs_minor++));
devvp->v_data = mfsp;
mfsp->mfs_baseoff = args.base;
mfsp->mfs_size = args.size;
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index ab4ac52..40fdd65 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -147,7 +147,18 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, ip->i_db[bn]);
- if (*bnp == 0) {
+ /*
+ * Since this is FFS independent code, we are out of
+ * scope for the definitions of BLK_NOCOPY and
+ * BLK_SNAP, but we do know that they will fall in
+ * the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts
+ * are made to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & SF_SNAPSHOT) &&
+ ip->i_db[bn] > 0 && ip->i_db[bn] < ump->um_seqinc) {
+ *bnp = -1;
+ } else if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
else
@@ -230,6 +241,17 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
if (bp)
bqrelse(bp);
+ /*
+ * Since this is FFS independent code, we are out of scope for the
+ * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
+ * will fall in the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts are made
+ * to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
+ *bnp = -1;
+ return (0);
+ }
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if (ip->i_flags & SF_SNAPSHOT)
diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c
index 485a6d2..b700fd3 100644
--- a/sys/ufs/ufs/ufs_inode.c
+++ b/sys/ufs/ufs/ufs_inode.c
@@ -77,7 +77,7 @@ ufs_inactive(ap)
if (ip->i_mode == 0)
goto out;
if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
- (void) vn_write_suspend_wait(vp, V_WAIT);
+ (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
#ifdef QUOTA
if (!getinoquota(ip))
(void)chkiq(ip, -1, NOCRED, 0);
@@ -94,10 +94,10 @@ ufs_inactive(ap)
}
if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) {
if ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 &&
- vn_write_suspend_wait(vp, V_NOWAIT)) {
+ vn_write_suspend_wait(vp, NULL, V_NOWAIT)) {
ip->i_flag &= ~IN_ACCESS;
} else {
- (void) vn_write_suspend_wait(vp, V_WAIT);
+ (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
UFS_UPDATE(vp, 0);
}
}
diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c
index 6396f67..19b3dad 100644
--- a/sys/ufs/ufs/ufs_quota.c
+++ b/sys/ufs/ufs/ufs_quota.c
@@ -898,7 +898,7 @@ dqsync(vp, dq)
return (0);
if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
panic("dqsync: file");
- (void) vn_write_suspend_wait(dqvp, V_WAIT);
+ (void) vn_write_suspend_wait(dqvp, NULL, V_WAIT);
if (vp != dqvp)
vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, p);
while (dq->dq_flags & DQ_LOCK) {
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index d97568c..0fac626 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -2036,7 +2036,8 @@ ufs_vinit(mntp, specops, fifoops, vpp)
case VCHR:
case VBLK:
vp->v_op = specops;
- addaliasu(vp, ip->i_rdev);
+ vp = addaliasu(vp, ip->i_rdev);
+ ip->i_vnode = vp;
break;
case VFIFO:
vp->v_op = fifoops;
OpenPOWER on IntegriCloud