summaryrefslogtreecommitdiffstats
path: root/sys/nfs
diff options
context:
space:
mode:
authordillon <dillon@FreeBSD.org>1999-12-14 19:07:54 +0000
committerdillon <dillon@FreeBSD.org>1999-12-14 19:07:54 +0000
commit3968ced3f8a2f899bd9f39b2b067f2236d485ce5 (patch)
tree6751051d6eb810f743e91024499cb430268b96e2 /sys/nfs
parent4c83e2cd181003d28cb3352704faaa4cd1acdeff (diff)
downloadFreeBSD-src-3968ced3f8a2f899bd9f39b2b067f2236d485ce5.zip
FreeBSD-src-3968ced3f8a2f899bd9f39b2b067f2236d485ce5.tar.gz
Fix two problems: First, fix the append seek position race that can
occur due to np->n_size potentially changing if nfs_getcacheblk() blocks in nfs_write(). Second, under -current we must supply the proper bufsize when obtaining buffers that straddle the EOF, but due to the fact that np->n_size can change out from under us it is possible that we may specify the wrong buffer size and wind up truncating dirty data written by another process. Both problems are solved by implementing nfs_rslock(), which allows us to lock around sensitive buffer cache operations such as those that occur when appending to a file. It is believed that this race is responsible for causing dirtyoff/dirtyend and (in stable) validoff/validend to exceed the buffer size. Therefore we have now added a warning printf for the dirtyoff/end case in current. However, we have introduced a new problem which we need to fix at some point, and that is that soft or intr NFS mounts may become uninterruptible from the point of view of process A which is stuck waiting on rslock while process B is stuck doing the rpc. To unstick process A, process B would have to be interrupted first. Reviewed by: Alfred Perlstein <bright@wintelcom.net>
Diffstat (limited to 'sys/nfs')
-rw-r--r--sys/nfs/nfs_bio.c168
-rw-r--r--sys/nfs/nfs_node.c1
-rw-r--r--sys/nfs/nfsnode.h26
3 files changed, 149 insertions, 46 deletions
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index 10be58d..8e99d98 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -480,20 +480,30 @@ nfs_bioread(vp, uio, ioflag, cred)
/*
* Obtain the buffer cache block. Figure out the buffer size
- * when we are at EOF. nfs_getcacheblk() will also force
- * uncached delayed-writes to be flushed to the server.
+ * when we are at EOF. If we are modifying the size of the
+ * buffer based on an EOF condition we need to hold
+ * nfs_rslock() through obtaining the buffer to prevent
+ * a potential writer-appender from messing with n_size.
+ * Otherwise we may accidentally truncate the buffer and
+ * lose dirty data.
*
* Note that bcount is *not* DEV_BSIZE aligned.
*/
+again:
bcount = biosize;
if ((off_t)lbn * biosize >= np->n_size) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
bcount = np->n_size - (off_t)lbn * biosize;
}
+ if (bcount != biosize && nfs_rslock(np, p) == ENOLCK)
+ goto again;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
+
+ if (bcount != biosize)
+ nfs_rsunlock(np, p);
if (!bp)
return (EINTR);
@@ -708,6 +718,7 @@ nfs_write(ap)
daddr_t lbn;
int bcount;
int n, on, error = 0, iomode, must_commit;
+ int haverslock = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
@@ -724,6 +735,11 @@ nfs_write(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ /*
+ * Synchronously flush pending buffers if we are in synchronous
+ * mode or if we are appending.
+ */
if (ioflag & (IO_APPEND | IO_SYNC)) {
if (np->n_flag & NMODIFIED) {
np->n_attrstamp = 0;
@@ -731,20 +747,49 @@ nfs_write(ap)
if (error)
return (error);
}
- if (ioflag & IO_APPEND) {
- np->n_attrstamp = 0;
- error = VOP_GETATTR(vp, &vattr, cred, p);
- if (error)
- return (error);
- uio->uio_offset = np->n_size;
- }
}
+
+ /*
+ * If IO_APPEND then load uio_offset. We restart here if we cannot
+ * get the append lock.
+ */
+restart:
+ if (ioflag & IO_APPEND) {
+ np->n_attrstamp = 0;
+ error = VOP_GETATTR(vp, &vattr, cred, p);
+ if (error)
+ return (error);
+ uio->uio_offset = np->n_size;
+ }
+
if (uio->uio_offset < 0)
return (EINVAL);
if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
+
+ /*
+ * We need to obtain the rslock if we intend to modify np->n_size
+ * in order to guarantee the append point with multiple contending
+ * writers, to guarantee that no other appenders modify n_size
+ * while we are trying to obtain a truncated buffer (i.e. to avoid
+ * accidentally truncating data written by another appender due to
+ * the race), and to ensure that the buffer is populated prior to
+ * our extending of the file. We hold rslock through the entire
+ * operation.
+ *
+ * Note that we do not synchronize the case where someone truncates
+ * the file while we are appending to it because attempting to lock
+ * this case may deadlock other parts of the system unexpectedly.
+ */
+ if ((ioflag & IO_APPEND) ||
+ uio->uio_offset + uio->uio_resid > np->n_size) {
+ if (nfs_rslock(np, p) == ENOLCK)
+ goto restart;
+ haverslock = 1;
+ }
+
/*
* Maybe this should be above the vnode op call, but so long as
* file servers have no limits, i don't think it matters
@@ -752,6 +797,8 @@ nfs_write(ap)
if (p && uio->uio_offset + uio->uio_resid >
p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
psignal(p, SIGXFSZ);
+ if (haverslock)
+ nfs_rsunlock(np, p);
return (EFBIG);
}
@@ -767,12 +814,12 @@ nfs_write(ap)
error = nqnfs_getlease(vp, ND_WRITE, cred, p);
} while (error == NQNFS_EXPIRED);
if (error)
- return (error);
+ break;
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE)) {
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
- return (error);
+ break;
np->n_brev = np->n_lrev;
}
}
@@ -780,8 +827,8 @@ nfs_write(ap)
iomode = NFSV3WRITE_FILESYNC;
error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
if (must_commit)
- nfs_clearcommit(vp->v_mount);
- return (error);
+ nfs_clearcommit(vp->v_mount);
+ break;
}
nfsstats.biocache_writes++;
lbn = uio->uio_offset / biosize;
@@ -795,36 +842,51 @@ again:
if (uio->uio_offset == np->n_size && n) {
/*
- * special append case. Obtain buffer prior to
- * resizing it to maintain B_CACHE.
+ * Get the buffer (in its pre-append state to maintain
+ * B_CACHE if it was previously set). Resize the
+ * nfsnode after we have locked the buffer to prevent
+ * readers from reading garbage.
*/
- long save;
-
bcount = on;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
- if (!bp)
- return (EINTR);
- save = bp->b_flags & B_CACHE;
- np->n_size = uio->uio_offset + n;
- np->n_flag |= NMODIFIED;
- vnode_pager_setsize(vp, np->n_size);
+ if (bp != NULL) {
+ long save;
+
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
- bcount += n;
- allocbuf(bp, bcount);
- bp->b_flags |= save;
+ save = bp->b_flags & B_CACHE;
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ }
} else {
+ /*
+ * Obtain the locked cache block first, and then
+ * adjust the file's size as appropriate.
+ */
+ bcount = on + n;
+ if ((off_t)lbn * biosize + bcount < np->n_size) {
+ if ((off_t)(lbn + 1) * biosize < np->n_size)
+ bcount = biosize;
+ else
+ bcount = np->n_size - (off_t)lbn * biosize;
+ }
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
}
- bcount = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size)
- bcount = np->n_size - (off_t)lbn * biosize;
- bp = nfs_getcacheblk(vp, lbn, bcount, p);
- if (!bp)
- return (EINTR);
+ }
+
+ if (!bp) {
+ error = EINTR;
+ break;
}
/*
@@ -857,11 +919,13 @@ again:
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
- return (error);
+ break;
}
}
- if (!bp)
- return (EINTR);
+ if (!bp) {
+ error = EINTR;
+ break;
+ }
if (bp->b_wcred == NOCRED) {
crhold(cred);
bp->b_wcred = cred;
@@ -869,13 +933,21 @@ again:
np->n_flag |= NMODIFIED;
/*
- * If dirtyend exceeds file size, chop it down. If this
- * creates a reverse-indexed or degenerate situation with
- * dirtyoff/end, 0 them.
+ * If dirtyend exceeds file size, chop it down. This should
+ * not normally occur but there is an append race where it
+ * might occur XXX, so we log it.
+ *
+ * If the chopping creates a reverse-indexed or degenerate
+ * situation with dirtyoff/end, we 0 both of them.
*/
- if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
- bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
+ if (bp->b_dirtyend > bcount) {
+ printf("NFS append race @%lx:%d\n",
+ (long)bp->b_blkno * DEV_BSIZE,
+ bp->b_dirtyend - bcount);
+ bp->b_dirtyend = bcount;
+ }
+
if (bp->b_dirtyoff >= bp->b_dirtyend)
bp->b_dirtyoff = bp->b_dirtyend = 0;
@@ -914,14 +986,14 @@ again:
} while (error == NQNFS_EXPIRED);
if (error) {
brelse(bp);
- return (error);
+ break;
}
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE)) {
brelse(bp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
- return (error);
+ break;
np->n_brev = np->n_lrev;
goto again;
}
@@ -940,7 +1012,7 @@ again:
if (error) {
bp->b_flags |= B_ERROR;
brelse(bp);
- return (error);
+ break;
}
/*
@@ -969,11 +1041,11 @@ again:
bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp->b_vp, bp);
if (error)
- return (error);
+ break;
if (np->n_flag & NQNFSNONCACHE) {
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
- return (error);
+ break;
}
} else if ((n + on) == biosize &&
(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
@@ -983,7 +1055,11 @@ again:
bdwrite(bp);
}
} while (uio->uio_resid > 0 && n > 0);
- return (0);
+
+ if (haverslock)
+ nfs_rsunlock(np, p);
+
+ return (error);
}
/*
diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c
index 53c73a1..4db6fc2 100644
--- a/sys/nfs/nfs_node.c
+++ b/sys/nfs/nfs_node.c
@@ -180,6 +180,7 @@ loop:
np->n_fhp = &np->n_fh;
bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
np->n_fhsize = fhsize;
+ lockinit(&np->n_rslock, PVFS, "nfrslk", 0, LK_NOPAUSE);
*npp = np;
if (nfs_node_hash_lock < 0)
diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h
index 3bc818f..1ca4529 100644
--- a/sys/nfs/nfsnode.h
+++ b/sys/nfs/nfsnode.h
@@ -118,6 +118,7 @@ struct nfsnode {
short n_fhsize; /* size in bytes, of fh */
short n_flag; /* Flag for locking.. */
nfsfh_t n_fh; /* Small File Handle */
+ struct lock n_rslock;
};
#define n_atim n_un1.nf_atim
@@ -157,6 +158,31 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
#if defined(KERNEL) || defined(_KERNEL)
+
+/*
+ * nfs_rslock - Attempt to obtain lock on nfsnode
+ *
+ * Attempt to obtain a lock on the passed nfsnode, returning ENOLCK
+ * if the lock could not be obtained due to our having to sleep. This
+ * function is generally used to lock around code that modifies an
+ * NFS file's size. In order to avoid deadlocks the lock
+ * should not be obtained while other locks are being held.
+ */
+
+static __inline
+int
+nfs_rslock(struct nfsnode *np, struct proc *p)
+{
+ return(lockmgr(&np->n_rslock, LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, p));
+}
+
+static __inline
+void
+nfs_rsunlock(struct nfsnode *np, struct proc *p)
+{
+ (void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, p);
+}
+
extern vop_t **fifo_nfsv2nodeop_p;
extern vop_t **nfsv2_vnodeop_p;
extern vop_t **spec_nfsv2nodeop_p;
OpenPOWER on IntegriCloud