summaryrefslogtreecommitdiffstats
path: root/sys/nfs
diff options
context:
space:
mode:
Diffstat (limited to 'sys/nfs')
-rw-r--r--sys/nfs/nfs_bio.c168
-rw-r--r--sys/nfs/nfs_node.c1
-rw-r--r--sys/nfs/nfsnode.h26
3 files changed, 149 insertions, 46 deletions
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index 10be58d..8e99d98 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -480,20 +480,30 @@ nfs_bioread(vp, uio, ioflag, cred)
/*
* Obtain the buffer cache block. Figure out the buffer size
- * when we are at EOF. nfs_getcacheblk() will also force
- * uncached delayed-writes to be flushed to the server.
+ * when we are at EOF. If we are modifying the size of the
+ * buffer based on an EOF condition we need to hold
+ * nfs_rslock() through obtaining the buffer to prevent
+ * a potential writer-appender from messing with n_size.
+ * Otherwise we may accidently truncate the buffer and
+ * lose dirty data.
*
* Note that bcount is *not* DEV_BSIZE aligned.
*/
+again:
bcount = biosize;
if ((off_t)lbn * biosize >= np->n_size) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
bcount = np->n_size - (off_t)lbn * biosize;
}
+ if (bcount != biosize && nfs_rslock(np, p) == ENOLCK)
+ goto again;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
+
+ if (bcount != biosize)
+ nfs_rsunlock(np, p);
if (!bp)
return (EINTR);
@@ -708,6 +718,7 @@ nfs_write(ap)
daddr_t lbn;
int bcount;
int n, on, error = 0, iomode, must_commit;
+ int haverslock = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
@@ -724,6 +735,11 @@ nfs_write(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ /*
+ * Synchronously flush pending buffers if we are in synchronous
+ * mode or if we are appending.
+ */
if (ioflag & (IO_APPEND | IO_SYNC)) {
if (np->n_flag & NMODIFIED) {
np->n_attrstamp = 0;
@@ -731,20 +747,49 @@ nfs_write(ap)
if (error)
return (error);
}
- if (ioflag & IO_APPEND) {
- np->n_attrstamp = 0;
- error = VOP_GETATTR(vp, &vattr, cred, p);
- if (error)
- return (error);
- uio->uio_offset = np->n_size;
- }
}
+
+ /*
+ * If IO_APPEND then load uio_offset. We restart here if we cannot
+ * get the append lock.
+ */
+restart:
+ if (ioflag & IO_APPEND) {
+ np->n_attrstamp = 0;
+ error = VOP_GETATTR(vp, &vattr, cred, p);
+ if (error)
+ return (error);
+ uio->uio_offset = np->n_size;
+ }
+
if (uio->uio_offset < 0)
return (EINVAL);
if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
+
+ /*
+ * We need to obtain the rslock if we intend to modify np->n_size
+ * in order to guarentee the append point with multiple contending
+ * writers, to guarentee that no other appenders modify n_size
+ * while we are trying to obtain a truncated buffer (i.e. to avoid
+ * accidently truncating data written by another appender due to
+ * the race), and to ensure that the buffer is populated prior to
+ * our extending of the file. We hold rslock through the entire
+ * operation.
+ *
+ * Note that we do not synchronize the case where someone truncates
+ * the file while we are appending to it because attempting to lock
+ * this case may deadlock other parts of the system unexpectedly.
+ */
+ if ((ioflag & IO_APPEND) ||
+ uio->uio_offset + uio->uio_resid > np->n_size) {
+ if (nfs_rslock(np, p) == ENOLCK)
+ goto restart;
+ haverslock = 1;
+ }
+
/*
* Maybe this should be above the vnode op call, but so long as
* file servers have no limits, i don't think it matters
@@ -752,6 +797,8 @@ nfs_write(ap)
if (p && uio->uio_offset + uio->uio_resid >
p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
psignal(p, SIGXFSZ);
+ if (haverslock)
+ nfs_rsunlock(np, p);
return (EFBIG);
}
@@ -767,12 +814,12 @@ nfs_write(ap)
error = nqnfs_getlease(vp, ND_WRITE, cred, p);
} while (error == NQNFS_EXPIRED);
if (error)
- return (error);
+ break;
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE)) {
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
- return (error);
+ break;
np->n_brev = np->n_lrev;
}
}
@@ -780,8 +827,8 @@ nfs_write(ap)
iomode = NFSV3WRITE_FILESYNC;
error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
if (must_commit)
- nfs_clearcommit(vp->v_mount);
- return (error);
+ nfs_clearcommit(vp->v_mount);
+ break;
}
nfsstats.biocache_writes++;
lbn = uio->uio_offset / biosize;
@@ -795,36 +842,51 @@ again:
if (uio->uio_offset == np->n_size && n) {
/*
- * special append case. Obtain buffer prior to
- * resizing it to maintain B_CACHE.
+ * Get the buffer (in its pre-append state to maintain
+ * B_CACHE if it was previously set). Resize the
+ * nfsnode after we have locked the buffer to prevent
+ * readers from reading garbage.
*/
- long save;
-
bcount = on;
bp = nfs_getcacheblk(vp, lbn, bcount, p);
- if (!bp)
- return (EINTR);
- save = bp->b_flags & B_CACHE;
- np->n_size = uio->uio_offset + n;
- np->n_flag |= NMODIFIED;
- vnode_pager_setsize(vp, np->n_size);
+ if (bp != NULL) {
+ long save;
+
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
- bcount += n;
- allocbuf(bp, bcount);
- bp->b_flags |= save;
+ save = bp->b_flags & B_CACHE;
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ }
} else {
+ /*
+ * Obtain the locked cache block first, and then
+ * adjust the file's size as appropriate.
+ */
+ bcount = on + n;
+ if ((off_t)lbn * biosize + bcount < np->n_size) {
+ if ((off_t)(lbn + 1) * biosize < np->n_size)
+ bcount = biosize;
+ else
+ bcount = np->n_size - (off_t)lbn * biosize;
+ }
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
}
- bcount = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size)
- bcount = np->n_size - (off_t)lbn * biosize;
- bp = nfs_getcacheblk(vp, lbn, bcount, p);
- if (!bp)
- return (EINTR);
+ }
+
+ if (!bp) {
+ error = EINTR;
+ break;
}
/*
@@ -857,11 +919,13 @@ again:
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
- return (error);
+ break;
}
}
- if (!bp)
- return (EINTR);
+ if (!bp) {
+ error = EINTR;
+ break;
+ }
if (bp->b_wcred == NOCRED) {
crhold(cred);
bp->b_wcred = cred;
@@ -869,13 +933,21 @@ again:
np->n_flag |= NMODIFIED;
/*
- * If dirtyend exceeds file size, chop it down. If this
- * creates a reverse-indexed or degenerate situation with
- * dirtyoff/end, 0 them.
+ * If dirtyend exceeds file size, chop it down. This should
+ * not normally occur but there is an append race where it
+ * might occur XXX, so we log it.
+ *
+ * If the chopping creates a reverse-indexed or degenerate
+ * situation with dirtyoff/end, we 0 both of them.
*/
- if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
- bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
+ if (bp->b_dirtyend > bcount) {
+ printf("NFS append race @%lx:%d\n",
+ (long)bp->b_blkno * DEV_BSIZE,
+ bp->b_dirtyend - bcount);
+ bp->b_dirtyend = bcount;
+ }
+
if (bp->b_dirtyoff >= bp->b_dirtyend)
bp->b_dirtyoff = bp->b_dirtyend = 0;
@@ -914,14 +986,14 @@ again:
} while (error == NQNFS_EXPIRED);
if (error) {
brelse(bp);
- return (error);
+ break;
}
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE)) {
brelse(bp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
- return (error);
+ break;
np->n_brev = np->n_lrev;
goto again;
}
@@ -940,7 +1012,7 @@ again:
if (error) {
bp->b_flags |= B_ERROR;
brelse(bp);
- return (error);
+ break;
}
/*
@@ -969,11 +1041,11 @@ again:
bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp->b_vp, bp);
if (error)
- return (error);
+ break;
if (np->n_flag & NQNFSNONCACHE) {
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
- return (error);
+ break;
}
} else if ((n + on) == biosize &&
(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
@@ -983,7 +1055,11 @@ again:
bdwrite(bp);
}
} while (uio->uio_resid > 0 && n > 0);
- return (0);
+
+ if (haverslock)
+ nfs_rsunlock(np, p);
+
+ return (error);
}
/*
diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c
index 53c73a1..4db6fc2 100644
--- a/sys/nfs/nfs_node.c
+++ b/sys/nfs/nfs_node.c
@@ -180,6 +180,7 @@ loop:
np->n_fhp = &np->n_fh;
bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
np->n_fhsize = fhsize;
+ lockinit(&np->n_rslock, PVFS, "nfrslk", 0, LK_NOPAUSE);
*npp = np;
if (nfs_node_hash_lock < 0)
diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h
index 3bc818f..1ca4529 100644
--- a/sys/nfs/nfsnode.h
+++ b/sys/nfs/nfsnode.h
@@ -118,6 +118,7 @@ struct nfsnode {
short n_fhsize; /* size in bytes, of fh */
short n_flag; /* Flag for locking.. */
nfsfh_t n_fh; /* Small File Handle */
+ struct lock n_rslock;
};
#define n_atim n_un1.nf_atim
@@ -157,6 +158,31 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
#if defined(KERNEL) || defined(_KERNEL)
+
+/*
+ * nfs_rslock - Attempt to obtain lock on nfsnode
+ *
+ * Attempt to obtain a lock on the passed nfsnode, returning ENOLCK
+ * if the lock could not be obtained due to our having to sleep. This
+ * function is generally used to lock around code that modifies an
+ * NFS file's size. In order to avoid deadlocks the lock
+ * should not be obtained while other locks are being held.
+ */
+
+static __inline
+int
+nfs_rslock(struct nfsnode *np, struct proc *p)
+{
+ return(lockmgr(&np->n_rslock, LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, p));
+}
+
+static __inline
+void
+nfs_rsunlock(struct nfsnode *np, struct proc *p)
+{
+ (void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, p);
+}
+
extern vop_t **fifo_nfsv2nodeop_p;
extern vop_t **nfsv2_vnodeop_p;
extern vop_t **spec_nfsv2nodeop_p;
OpenPOWER on IntegriCloud