diff options
Diffstat (limited to 'sys')
-rw-r--r--  sys/nfs/nfs_bio.c       | 168
-rw-r--r--  sys/nfs/nfs_node.c      |   1
-rw-r--r--  sys/nfs/nfsnode.h       |  26
-rw-r--r--  sys/nfsclient/nfs_bio.c | 168
-rw-r--r--  sys/nfsclient/nfs_node.c |  1
-rw-r--r--  sys/nfsclient/nfsnode.h |  26
6 files changed, 298 insertions(+), 92 deletions(-)
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index 10be58d..8e99d98 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -480,20 +480,30 @@ nfs_bioread(vp, uio, ioflag, cred) /* * Obtain the buffer cache block. Figure out the buffer size - * when we are at EOF. nfs_getcacheblk() will also force - * uncached delayed-writes to be flushed to the server. + * when we are at EOF. If we are modifying the size of the + * buffer based on an EOF condition we need to hold + * nfs_rslock() through obtaining the buffer to prevent + * a potential writer-appender from messing with n_size. + * Otherwise we may accidently truncate the buffer and + * lose dirty data. * * Note that bcount is *not* DEV_BSIZE aligned. */ +again: bcount = biosize; if ((off_t)lbn * biosize >= np->n_size) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > np->n_size) { bcount = np->n_size - (off_t)lbn * biosize; } + if (bcount != biosize && nfs_rslock(np, p) == ENOLCK) + goto again; bp = nfs_getcacheblk(vp, lbn, bcount, p); + + if (bcount != biosize) + nfs_rsunlock(np, p); if (!bp) return (EINTR); @@ -708,6 +718,7 @@ nfs_write(ap) daddr_t lbn; int bcount; int n, on, error = 0, iomode, must_commit; + int haverslock = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -724,6 +735,11 @@ nfs_write(ap) if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); + + /* + * Synchronously flush pending buffers if we are in synchronous + * mode or if we are appending. + */ if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; @@ -731,20 +747,49 @@ nfs_write(ap) if (error) return (error); } - if (ioflag & IO_APPEND) { - np->n_attrstamp = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) - return (error); - uio->uio_offset = np->n_size; - } } + + /* + * If IO_APPEND then load uio_offset. We restart here if we cannot + * get the append lock. 
+ */ +restart: + if (ioflag & IO_APPEND) { + np->n_attrstamp = 0; + error = VOP_GETATTR(vp, &vattr, cred, p); + if (error) + return (error); + uio->uio_offset = np->n_size; + } + if (uio->uio_offset < 0) return (EINVAL); if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); if (uio->uio_resid == 0) return (0); + + /* + * We need to obtain the rslock if we intend to modify np->n_size + * in order to guarentee the append point with multiple contending + * writers, to guarentee that no other appenders modify n_size + * while we are trying to obtain a truncated buffer (i.e. to avoid + * accidently truncating data written by another appender due to + * the race), and to ensure that the buffer is populated prior to + * our extending of the file. We hold rslock through the entire + * operation. + * + * Note that we do not synchronize the case where someone truncates + * the file while we are appending to it because attempting to lock + * this case may deadlock other parts of the system unexpectedly. 
+ */ + if ((ioflag & IO_APPEND) || + uio->uio_offset + uio->uio_resid > np->n_size) { + if (nfs_rslock(np, p) == ENOLCK) + goto restart; + haverslock = 1; + } + /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters @@ -752,6 +797,8 @@ nfs_write(ap) if (p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); + if (haverslock) + nfs_rsunlock(np, p); return (EFBIG); } @@ -767,12 +814,12 @@ nfs_write(ap) error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) - return (error); + break; if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) - return (error); + break; np->n_brev = np->n_lrev; } } @@ -780,8 +827,8 @@ nfs_write(ap) iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) - nfs_clearcommit(vp->v_mount); - return (error); + nfs_clearcommit(vp->v_mount); + break; } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; @@ -795,36 +842,51 @@ again: if (uio->uio_offset == np->n_size && n) { /* - * special append case. Obtain buffer prior to - * resizing it to maintain B_CACHE. + * Get the buffer (in its pre-append state to maintain + * B_CACHE if it was previously set). Resize the + * nfsnode after we have locked the buffer to prevent + * readers from reading garbage. 
*/ - long save; - bcount = on; bp = nfs_getcacheblk(vp, lbn, bcount, p); - if (!bp) - return (EINTR); - save = bp->b_flags & B_CACHE; - np->n_size = uio->uio_offset + n; - np->n_flag |= NMODIFIED; - vnode_pager_setsize(vp, np->n_size); + if (bp != NULL) { + long save; + + np->n_size = uio->uio_offset + n; + np->n_flag |= NMODIFIED; + vnode_pager_setsize(vp, np->n_size); - bcount += n; - allocbuf(bp, bcount); - bp->b_flags |= save; + save = bp->b_flags & B_CACHE; + bcount += n; + allocbuf(bp, bcount); + bp->b_flags |= save; + } } else { + /* + * Obtain the locked cache block first, and then + * adjust the file's size as appropriate. + */ + bcount = on + n; + if ((off_t)lbn * biosize + bcount < np->n_size) { + if ((off_t)(lbn + 1) * biosize < np->n_size) + bcount = biosize; + else + bcount = np->n_size - (off_t)lbn * biosize; + } + + bp = nfs_getcacheblk(vp, lbn, bcount, p); + if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } - bcount = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size) - bcount = np->n_size - (off_t)lbn * biosize; - bp = nfs_getcacheblk(vp, lbn, bcount, p); - if (!bp) - return (EINTR); + } + + if (!bp) { + error = EINTR; + break; } /* @@ -857,11 +919,13 @@ again: error = nfs_doio(bp, cred, p); if (error) { brelse(bp); - return (error); + break; } } - if (!bp) - return (EINTR); + if (!bp) { + error = EINTR; + break; + } if (bp->b_wcred == NOCRED) { crhold(cred); bp->b_wcred = cred; @@ -869,13 +933,21 @@ again: np->n_flag |= NMODIFIED; /* - * If dirtyend exceeds file size, chop it down. If this - * creates a reverse-indexed or degenerate situation with - * dirtyoff/end, 0 them. + * If dirtyend exceeds file size, chop it down. This should + * not normally occur but there is an append race where it + * might occur XXX, so we log it. + * + * If the chopping creates a reverse-indexed or degenerate + * situation with dirtyoff/end, we 0 both of them. 
*/ - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; + if (bp->b_dirtyend > bcount) { + printf("NFS append race @%lx:%d\n", + (long)bp->b_blkno * DEV_BSIZE, + bp->b_dirtyend - bcount); + bp->b_dirtyend = bcount; + } + if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; @@ -914,14 +986,14 @@ again: } while (error == NQNFS_EXPIRED); if (error) { brelse(bp); - return (error); + break; } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { brelse(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) - return (error); + break; np->n_brev = np->n_lrev; goto again; } @@ -940,7 +1012,7 @@ again: if (error) { bp->b_flags |= B_ERROR; brelse(bp); - return (error); + break; } /* @@ -969,11 +1041,11 @@ again: bp->b_flags |= B_NOCACHE; error = VOP_BWRITE(bp->b_vp, bp); if (error) - return (error); + break; if (np->n_flag & NQNFSNONCACHE) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) - return (error); + break; } } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { @@ -983,7 +1055,11 @@ again: bdwrite(bp); } } while (uio->uio_resid > 0 && n > 0); - return (0); + + if (haverslock) + nfs_rsunlock(np, p); + + return (error); } /* diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c index 53c73a1..4db6fc2 100644 --- a/sys/nfs/nfs_node.c +++ b/sys/nfs/nfs_node.c @@ -180,6 +180,7 @@ loop: np->n_fhp = &np->n_fh; bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize); np->n_fhsize = fhsize; + lockinit(&np->n_rslock, PVFS, "nfrslk", 0, LK_NOPAUSE); *npp = np; if (nfs_node_hash_lock < 0) diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h index 3bc818f..1ca4529 100644 --- a/sys/nfs/nfsnode.h +++ b/sys/nfs/nfsnode.h @@ -118,6 +118,7 @@ struct nfsnode { short n_fhsize; /* size in bytes, of fh */ short n_flag; /* Flag for locking.. 
*/ nfsfh_t n_fh; /* Small File Handle */ + struct lock n_rslock; }; #define n_atim n_un1.nf_atim @@ -157,6 +158,31 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; #if defined(KERNEL) || defined(_KERNEL) + +/* + * nfs_rslock - Attempt to obtain lock on nfsnode + * + * Attempt to obtain a lock on the passed nfsnode, returning ENOLCK + * if the lock could not be obtained due to our having to sleep. This + * function is generally used to lock around code that modifies an + * NFS file's size. In order to avoid deadlocks the lock + * should not be obtained while other locks are being held. + */ + +static __inline +int +nfs_rslock(struct nfsnode *np, struct proc *p) +{ + return(lockmgr(&np->n_rslock, LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, p)); +} + +static __inline +void +nfs_rsunlock(struct nfsnode *np, struct proc *p) +{ + (void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, p); +} + extern vop_t **fifo_nfsv2nodeop_p; extern vop_t **nfsv2_vnodeop_p; extern vop_t **spec_nfsv2nodeop_p; diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index 10be58d..8e99d98 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -480,20 +480,30 @@ nfs_bioread(vp, uio, ioflag, cred) /* * Obtain the buffer cache block. Figure out the buffer size - * when we are at EOF. nfs_getcacheblk() will also force - * uncached delayed-writes to be flushed to the server. + * when we are at EOF. If we are modifying the size of the + * buffer based on an EOF condition we need to hold + * nfs_rslock() through obtaining the buffer to prevent + * a potential writer-appender from messing with n_size. + * Otherwise we may accidently truncate the buffer and + * lose dirty data. * * Note that bcount is *not* DEV_BSIZE aligned. 
*/ +again: bcount = biosize; if ((off_t)lbn * biosize >= np->n_size) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > np->n_size) { bcount = np->n_size - (off_t)lbn * biosize; } + if (bcount != biosize && nfs_rslock(np, p) == ENOLCK) + goto again; bp = nfs_getcacheblk(vp, lbn, bcount, p); + + if (bcount != biosize) + nfs_rsunlock(np, p); if (!bp) return (EINTR); @@ -708,6 +718,7 @@ nfs_write(ap) daddr_t lbn; int bcount; int n, on, error = 0, iomode, must_commit; + int haverslock = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -724,6 +735,11 @@ nfs_write(ap) if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); + + /* + * Synchronously flush pending buffers if we are in synchronous + * mode or if we are appending. + */ if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; @@ -731,20 +747,49 @@ nfs_write(ap) if (error) return (error); } - if (ioflag & IO_APPEND) { - np->n_attrstamp = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) - return (error); - uio->uio_offset = np->n_size; - } } + + /* + * If IO_APPEND then load uio_offset. We restart here if we cannot + * get the append lock. + */ +restart: + if (ioflag & IO_APPEND) { + np->n_attrstamp = 0; + error = VOP_GETATTR(vp, &vattr, cred, p); + if (error) + return (error); + uio->uio_offset = np->n_size; + } + if (uio->uio_offset < 0) return (EINVAL); if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); if (uio->uio_resid == 0) return (0); + + /* + * We need to obtain the rslock if we intend to modify np->n_size + * in order to guarentee the append point with multiple contending + * writers, to guarentee that no other appenders modify n_size + * while we are trying to obtain a truncated buffer (i.e. to avoid + * accidently truncating data written by another appender due to + * the race), and to ensure that the buffer is populated prior to + * our extending of the file. 
We hold rslock through the entire + * operation. + * + * Note that we do not synchronize the case where someone truncates + * the file while we are appending to it because attempting to lock + * this case may deadlock other parts of the system unexpectedly. + */ + if ((ioflag & IO_APPEND) || + uio->uio_offset + uio->uio_resid > np->n_size) { + if (nfs_rslock(np, p) == ENOLCK) + goto restart; + haverslock = 1; + } + /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters @@ -752,6 +797,8 @@ nfs_write(ap) if (p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); + if (haverslock) + nfs_rsunlock(np, p); return (EFBIG); } @@ -767,12 +814,12 @@ nfs_write(ap) error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) - return (error); + break; if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) - return (error); + break; np->n_brev = np->n_lrev; } } @@ -780,8 +827,8 @@ nfs_write(ap) iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) - nfs_clearcommit(vp->v_mount); - return (error); + nfs_clearcommit(vp->v_mount); + break; } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; @@ -795,36 +842,51 @@ again: if (uio->uio_offset == np->n_size && n) { /* - * special append case. Obtain buffer prior to - * resizing it to maintain B_CACHE. + * Get the buffer (in its pre-append state to maintain + * B_CACHE if it was previously set). Resize the + * nfsnode after we have locked the buffer to prevent + * readers from reading garbage. 
*/ - long save; - bcount = on; bp = nfs_getcacheblk(vp, lbn, bcount, p); - if (!bp) - return (EINTR); - save = bp->b_flags & B_CACHE; - np->n_size = uio->uio_offset + n; - np->n_flag |= NMODIFIED; - vnode_pager_setsize(vp, np->n_size); + if (bp != NULL) { + long save; + + np->n_size = uio->uio_offset + n; + np->n_flag |= NMODIFIED; + vnode_pager_setsize(vp, np->n_size); - bcount += n; - allocbuf(bp, bcount); - bp->b_flags |= save; + save = bp->b_flags & B_CACHE; + bcount += n; + allocbuf(bp, bcount); + bp->b_flags |= save; + } } else { + /* + * Obtain the locked cache block first, and then + * adjust the file's size as appropriate. + */ + bcount = on + n; + if ((off_t)lbn * biosize + bcount < np->n_size) { + if ((off_t)(lbn + 1) * biosize < np->n_size) + bcount = biosize; + else + bcount = np->n_size - (off_t)lbn * biosize; + } + + bp = nfs_getcacheblk(vp, lbn, bcount, p); + if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } - bcount = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size) - bcount = np->n_size - (off_t)lbn * biosize; - bp = nfs_getcacheblk(vp, lbn, bcount, p); - if (!bp) - return (EINTR); + } + + if (!bp) { + error = EINTR; + break; } /* @@ -857,11 +919,13 @@ again: error = nfs_doio(bp, cred, p); if (error) { brelse(bp); - return (error); + break; } } - if (!bp) - return (EINTR); + if (!bp) { + error = EINTR; + break; + } if (bp->b_wcred == NOCRED) { crhold(cred); bp->b_wcred = cred; @@ -869,13 +933,21 @@ again: np->n_flag |= NMODIFIED; /* - * If dirtyend exceeds file size, chop it down. If this - * creates a reverse-indexed or degenerate situation with - * dirtyoff/end, 0 them. + * If dirtyend exceeds file size, chop it down. This should + * not normally occur but there is an append race where it + * might occur XXX, so we log it. + * + * If the chopping creates a reverse-indexed or degenerate + * situation with dirtyoff/end, we 0 both of them. 
*/ - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; + if (bp->b_dirtyend > bcount) { + printf("NFS append race @%lx:%d\n", + (long)bp->b_blkno * DEV_BSIZE, + bp->b_dirtyend - bcount); + bp->b_dirtyend = bcount; + } + if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; @@ -914,14 +986,14 @@ again: } while (error == NQNFS_EXPIRED); if (error) { brelse(bp); - return (error); + break; } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { brelse(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) - return (error); + break; np->n_brev = np->n_lrev; goto again; } @@ -940,7 +1012,7 @@ again: if (error) { bp->b_flags |= B_ERROR; brelse(bp); - return (error); + break; } /* @@ -969,11 +1041,11 @@ again: bp->b_flags |= B_NOCACHE; error = VOP_BWRITE(bp->b_vp, bp); if (error) - return (error); + break; if (np->n_flag & NQNFSNONCACHE) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) - return (error); + break; } } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { @@ -983,7 +1055,11 @@ again: bdwrite(bp); } } while (uio->uio_resid > 0 && n > 0); - return (0); + + if (haverslock) + nfs_rsunlock(np, p); + + return (error); } /* diff --git a/sys/nfsclient/nfs_node.c b/sys/nfsclient/nfs_node.c index 53c73a1..4db6fc2 100644 --- a/sys/nfsclient/nfs_node.c +++ b/sys/nfsclient/nfs_node.c @@ -180,6 +180,7 @@ loop: np->n_fhp = &np->n_fh; bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize); np->n_fhsize = fhsize; + lockinit(&np->n_rslock, PVFS, "nfrslk", 0, LK_NOPAUSE); *npp = np; if (nfs_node_hash_lock < 0) diff --git a/sys/nfsclient/nfsnode.h b/sys/nfsclient/nfsnode.h index 3bc818f..1ca4529 100644 --- a/sys/nfsclient/nfsnode.h +++ b/sys/nfsclient/nfsnode.h @@ -118,6 +118,7 @@ struct nfsnode { short n_fhsize; /* size in bytes, of fh */ short n_flag; /* Flag for locking.. 
*/ nfsfh_t n_fh; /* Small File Handle */ + struct lock n_rslock; }; #define n_atim n_un1.nf_atim @@ -157,6 +158,31 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; #if defined(KERNEL) || defined(_KERNEL) + +/* + * nfs_rslock - Attempt to obtain lock on nfsnode + * + * Attempt to obtain a lock on the passed nfsnode, returning ENOLCK + * if the lock could not be obtained due to our having to sleep. This + * function is generally used to lock around code that modifies an + * NFS file's size. In order to avoid deadlocks the lock + * should not be obtained while other locks are being held. + */ + +static __inline +int +nfs_rslock(struct nfsnode *np, struct proc *p) +{ + return(lockmgr(&np->n_rslock, LK_EXCLUSIVE | LK_CANRECURSE | LK_SLEEPFAIL, NULL, p)); +} + +static __inline +void +nfs_rsunlock(struct nfsnode *np, struct proc *p) +{ + (void)lockmgr(&np->n_rslock, LK_RELEASE, NULL, p); +} + extern vop_t **fifo_nfsv2nodeop_p; extern vop_t **nfsv2_vnodeop_p; extern vop_t **spec_nfsv2nodeop_p; |