3 files changed, 149 insertions, 46 deletions
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index 10be58d..8e99d98 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -480,20 +480,30 @@ nfs_bioread(vp, uio, ioflag, cred)
 
 		/*
 		 * Obtain the buffer cache block.  Figure out the buffer size
-		 * when we are at EOF.  nfs_getcacheblk() will also force
-		 * uncached delayed-writes to be flushed to the server.
+		 * when we are at EOF.  If we are modifying the size of the
+		 * buffer based on an EOF condition we need to hold 
+		 * nfs_rslock() through obtaining the buffer to prevent
+		 * a potential writer-appender from messing with n_size.
+		 * Otherwise we may accidently truncate the buffer and
+		 * lose dirty data.
 		 *
 		 * Note that bcount is *not* DEV_BSIZE aligned.
 		 */
 
+again:
 		bcount = biosize;
 		if ((off_t)lbn * biosize >= np->n_size) {
 			bcount = 0;
 		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
 			bcount = np->n_size - (off_t)lbn * biosize;
 		}
+		if (bcount != biosize && nfs_rslock(np, p) == ENOLCK)
+			goto again;
 
 		bp = nfs_getcacheblk(vp, lbn, bcount, p);
+
+		if (bcount != biosize)
+			nfs_rsunlock(np, p);
 		if (!bp)
 			return (EINTR);
 
@@ -708,6 +718,7 @@ nfs_write(ap)
 	daddr_t lbn;
 	int bcount;
 	int n, on, error = 0, iomode, must_commit;
+	int haverslock = 0;
 
 #ifdef DIAGNOSTIC
 	if (uio->uio_rw != UIO_WRITE)
@@ -724,6 +735,11 @@ nfs_write(ap)
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
 		(void)nfs_fsinfo(nmp, vp, cred, p);
+
+	/*
+	 * Synchronously flush pending buffers if we are in synchronous
+	 * mode or if we are appending.
+	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
 		if (np->n_flag & NMODIFIED) {
 			np->n_attrstamp = 0;
@@ -731,20 +747,49 @@ nfs_write(ap)
 			if (error)
 				return (error);
 		}
-		if (ioflag & IO_APPEND) {
-			np->n_attrstamp = 0;
-			error = VOP_GETATTR(vp, &vattr, cred, p);
-			if (error)
-				return (error);
-			uio->uio_offset = np->n_size;
-		}
 	}
+
+	/*
+	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
+	 * get the append lock.
+	 */
+restart:
+	if (ioflag & IO_APPEND) {
+		np->n_attrstamp = 0;
+		error = VOP_GETATTR(vp, &vattr, cred, p);
+		if (error)
+			return (error);
+		uio->uio_offset = np->n_size;
+	}
+
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 		return (EFBIG);
 	if (uio->uio_resid == 0)
 		return (0);
+
+	/*
+	 * We need to obtain the rslock if we intend to modify np->n_size
+	 * in order to guarentee the append point with multiple contending
+	 * writers, to guarentee that no other appenders modify n_size
+	 * while we are trying to obtain a truncated buffer (i.e. to avoid
+	 * accidently truncating data written by another appender due to
+	 * the race), and to ensure that the buffer is populated prior to
+	 * our extending of the file.  We hold rslock through the entire
+	 * operation.
+	 *
+	 * Note that we do not synchronize the case where someone truncates
+	 * the file while we are appending to it because attempting to lock
+	 * this case may deadlock other parts of the system unexpectedly.
+	 */
+	if ((ioflag & IO_APPEND) ||
+	    uio->uio_offset + uio->uio_resid > np->n_size) {
+		if (nfs_rslock(np, p) == ENOLCK)
+			goto restart;
+		haverslock = 1;
+	}
+
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, i don't think it matters
@@ -752,6 +797,8 @@ nfs_write(ap)
 	if (p && uio->uio_offset + uio->uio_resid >
 	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 		psignal(p, SIGXFSZ);
+		if (haverslock)
+			nfs_rsunlock(np, p);
 		return (EFBIG);
 	}
 
@@ -767,12 +814,12 @@ nfs_write(ap)
 				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 			} while (error == NQNFS_EXPIRED);
 			if (error)
-				return (error);
+				break;
 			if (np->n_lrev != np->n_brev ||
 			    (np->n_flag & NQNFSNONCACHE)) {
 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 				if (error)
-					return (error);
+					break;
 				np->n_brev = np->n_lrev;
 			}
 		}
@@ -780,8 +827,8 @@ nfs_write(ap)
 		    iomode = NFSV3WRITE_FILESYNC;
 		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
 		    if (must_commit)
-			nfs_clearcommit(vp->v_mount);
-		    return (error);
+			    nfs_clearcommit(vp->v_mount);
+		    break;
 		}
 		nfsstats.biocache_writes++;
 		lbn = uio->uio_offset / biosize;
@@ -795,36 +842,51 @@ again:
 
 		if (uio->uio_offset == np->n_size && n) {
 			/*
-			 * special append case.  Obtain buffer prior to
-			 * resizing it to maintain B_CACHE.
+			 * Get the buffer (in its pre-append state to maintain
+			 * B_CACHE if it was previously set).  Resize the
+			 * nfsnode after we have locked the buffer to prevent
+			 * readers from reading garbage.
 			 */
-			long save;
-
 			bcount = on;
 			bp = nfs_getcacheblk(vp, lbn, bcount, p);
-			if (!bp)
-				return (EINTR);
-			save = bp->b_flags & B_CACHE;
 
-			np->n_size = uio->uio_offset + n;
-			np->n_flag |= NMODIFIED;
-			vnode_pager_setsize(vp, np->n_size);
+			if (bp != NULL) {
+				long save;
+
+				np->n_size = uio->uio_offset + n;
+				np->n_flag |= NMODIFIED;
+				vnode_pager_setsize(vp, np->n_size);
 
-			bcount += n;
-			allocbuf(bp, bcount);
-			bp->b_flags |= save;
+				save = bp->b_flags & B_CACHE;
+				bcount += n;
+				allocbuf(bp, bcount);
+				bp->b_flags |= save;
+			}
 		} else {
+			/*
+			 * Obtain the locked cache block first, and then 
+			 * adjust the file's size as appropriate.
+			 */
+			bcount = on + n;
+			if ((off_t)lbn * biosize + bcount < np->n_size) {
+				if ((off_t)(lbn + 1) * biosize < np->n_size)
+					bcount = biosize;
+				else
+					bcount = np->n_size - (off_t)lbn * biosize;
+			}
+
+			bp = nfs_getcacheblk(vp, lbn, bcount, p);
+
 			if (uio->uio_offset + n > np->n_size) {
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
-			bcount = biosize;
-			if ((off_t)(lbn + 1) * biosize > np->n_size) 
-				bcount = np->n_size - (off_t)lbn * biosize;
-			bp = nfs_getcacheblk(vp, lbn, bcount, p);
-			if (!bp)
-				return (EINTR);
+		}
+
+		if (!bp) {
+			error = EINTR;
+			break;
 		}
 
 		/*
@@ -857,11 +919,13 @@ again:
 			error = nfs_doio(bp, cred, p);
 			if (error) {
 				brelse(bp);
-				return (error);
+				break;
 			}
 		}
-		if (!bp)
-			return (EINTR);
+		if (!bp) {
+			error = EINTR;
+			break;
+		}
 		if (bp->b_wcred == NOCRED) {
 			crhold(cred);
 			bp->b_wcred = cred;
@@ -869,13 +933,21 @@ again:
 		np->n_flag |= NMODIFIED;
 
 		/*
-		 * If dirtyend exceeds file size, chop it down.  If this
-		 * creates a reverse-indexed or degenerate situation with
-		 * dirtyoff/end, 0 them.
+		 * If dirtyend exceeds file size, chop it down.  This should
+		 * not normally occur but there is an append race where it
+		 * might occur XXX, so we log it. 
+		 *
+		 * If the chopping creates a reverse-indexed or degenerate
+		 * situation with dirtyoff/end, we 0 both of them.
 		 */
 
-		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
-			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
+		if (bp->b_dirtyend > bcount) {
+			printf("NFS append race @%lx:%d\n", 
+			    (long)bp->b_blkno * DEV_BSIZE, 
+			    bp->b_dirtyend - bcount);
+			bp->b_dirtyend = bcount;
+		}
+
 		if (bp->b_dirtyoff >= bp->b_dirtyend)
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
 
@@ -914,14 +986,14 @@ again:
 			} while (error == NQNFS_EXPIRED);
 			if (error) {
 				brelse(bp);
-				return (error);
+				break;
 			}
 			if (np->n_lrev != np->n_brev ||
 			    (np->n_flag & NQNFSNONCACHE)) {
 				brelse(bp);
 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 				if (error)
-					return (error);
+					break;
 				np->n_brev = np->n_lrev;
 				goto again;
 			}
@@ -940,7 +1012,7 @@ again:
 		if (error) {
 			bp->b_flags |= B_ERROR;
 			brelse(bp);
-			return (error);
+			break;
 		}
 
 		/*
@@ -969,11 +1041,11 @@ again:
 				bp->b_flags |= B_NOCACHE;
 			error = VOP_BWRITE(bp->b_vp, bp);
 			if (error)
-				return (error);
+				break;
 			if (np->n_flag & NQNFSNONCACHE) {
 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 				if (error)
-					return (error);
+					break;
 			}
 		} else if ((n + on) == biosize &&
 			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
@@ -983,7 +1055,11 @@ again:
 			bdwrite(bp);
 		}
 	} while (uio->uio_resid > 0 && n > 0);
-	return (0);
+
+	if (haverslock)
+		nfs_rsunlock(np, p);
+
+	return (error);
 }
 
 /*
diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c
index 53c73a1..4db6fc2 100644
--- a/sys/nfs/nfs_node.c
+++ b/sys/nfs/nfs_node.c
@@ -180,6 +180,7 @@ loop:
 		np->n_fhp = &np->n_fh;
 	bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
 	np->n_fhsize = fhsize;
+	lockinit(&np->n_rslock, PVFS, "nfrslk", 0, LK_NOPAUSE);
 	*npp = np;
 
 	if (nfs_node_hash_lock < 0)
diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h
index 3bc818f..1ca4529 100644
--- a/sys/nfs/nfsnode.h
+++ b/sys/nfs/nfsnode.h
@@ -118,6 +118,7 @@ struct nfsnode {
 	short			n_fhsize;	/* size in bytes, of fh */
 	short			n_flag;		/* Flag for locking.. */
 	nfsfh_t			n_fh;		/* Small File Handle */
+	struct lock		n_rslock;
 };
 
 #define n_atim		n_un1.nf_atim
@@ -157,6 +158,31 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
 extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
 
 #if defined(KERNEL) || defined(_KERNEL)
+
+/*
+ *	nfs_rslock -	Attempt to obtain lock on nfsnode
+ *
+ *	Attempt to obtain a lock on the passed nfsnode, returning ENOLCK
+ *	if the lock could not be obtained due to our having to sleep.  This
+ *	function is generally used to lock around code that modifies an
+ *	NFS file's size.  In order to avoid deadlocks the lock
+ *	should not be obtained while other locks are being held.
+ */
+
+static __inline
+int
+nfs_rslock(struct nfsnode *np, struct proc *p)
+{
+        return(lockmgr(&np->n_rslock, LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, p));
+}
+
+static __inline
+void
+nfs_rsunlock(struct nfsnode *np, struct proc *p)
+{
+	(void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, p);
+}
+
 extern	vop_t	**fifo_nfsv2nodeop_p;
 extern	vop_t	**nfsv2_vnodeop_p;
 extern	vop_t	**spec_nfsv2nodeop_p;