summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
authordillon <dillon@FreeBSD.org>2001-12-14 01:16:57 +0000
committerdillon <dillon@FreeBSD.org>2001-12-14 01:16:57 +0000
commitcd4d323ad300ef689d4b7dae113791a0f82ee65e (patch)
tree41ee3cc72a843eef0d91fff1ed8c59b176341b5b /sys
parent637ec33540e81a939866bcbced0bbbff709ae333 (diff)
downloadFreeBSD-src-cd4d323ad300ef689d4b7dae113791a0f82ee65e.zip
FreeBSD-src-cd4d323ad300ef689d4b7dae113791a0f82ee65e.tar.gz
This fixes a large number of bugs in our NFS client side code. A recent
commit by Kirk also fixed a softupdates bug that could easily be triggered by server side NFS. * An edge case with shared R+W mmap()'s and truncate whereby the system would inappropriately clear the dirty bits on still-dirty data. (applicable to all filesystems) THIS FIX TEMPORARILY DISABLED PENDING FURTHER TESTING. see vm/vm_page.c line 1641 * The straddle case for VM pages and buffer cache buffers when truncating. (applicable to NFS client side) * Possible SMP database corruption due to vm_pager_unmap_page() not clearing the TLB for the other CPUs. (applicable to NFS client side but could affect all filesystems). Note: not considered serious since the corruption occurs beyond the file EOF. * When flushing a dirty buffer due to B_CACHE getting cleared, we were accidentally setting B_CACHE again (that is, bwrite() sets B_CACHE), when we really want it to stay clear after the write is complete. This resulted in a corrupt buffer. (applicable to all filesystems but probably only triggered by NFS) * We have to call vtruncbuf() when ftruncate()ing to remove any buffer cache buffers. This is still tentative; I may be able to remove it due to the second bug fix. (applicable to NFS client side) * vnode_pager_setsize() race against nfs_vinvalbuf()... we have to set n_size before calling nfs_vinvalbuf or the NFS code may recursively vnode_pager_setsize() to the original value before the truncate. This is what was causing the user mmap bus faults in the nfs tester program. (applicable to NFS client side) * Fix to softupdates (see ufs/ffs/ffs_inode.c 1.73, commit made by Kirk). Testing program written by: Avadis Tevanian, Jr. Testing program supplied by: jkh / Apple (see Dec2001 posting to freebsd-hackers with Subject 'NFS: How to make FreeBSD fall on its face in one easy step') MFC after: 1 week
Diffstat (limited to 'sys')
-rw-r--r--sys/kern/vfs_bio.c12
-rw-r--r--sys/nfsclient/nfs.h2
-rw-r--r--sys/nfsclient/nfs_bio.c59
-rw-r--r--sys/nfsclient/nfs_vnops.c23
-rw-r--r--sys/vm/vm_page.c14
-rw-r--r--sys/vm/vnode_pager.c29
6 files changed, 128 insertions, 11 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index f250367..29646d9 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -2271,9 +2271,21 @@ loop:
* to softupdates re-dirtying the buffer. In the latter
* case, B_CACHE is set after the first write completes,
* preventing further loops.
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets real
+ * confusing. This is much easier.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
BUF_WRITE(bp);
goto loop;
}
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index 5811883..42db54c 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -288,6 +288,8 @@ void nfs_clearcommit(struct mount *);
int nfs_writebp(struct buf *, int, struct thread *);
int nfs_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
struct thread *);
+int nfs_meta_setsize (struct vnode *, struct ucred *,
+ struct thread *, u_quad_t);
#endif /* _KERNEL */
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 754d561..2f3a44b 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -206,8 +206,14 @@ nfs_getpages(struct vop_getpages_args *ap)
vm_page_set_validclean(m, 0, size - toff);
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
+ } else {
+ /*
+ * Read operation was short. If no error occurred
+ * we may have hit a zero-fill section. We simply
+ * leave valid set to 0.
+ */
+ ;
}
-
if (i != ap->a_reqpage) {
/*
* Whether or not to leave the page activated is up in
@@ -831,9 +837,7 @@ again:
else
bcount = np->n_size - (off_t)lbn * biosize;
}
-
bp = nfs_getcacheblk(vp, lbn, bcount, td);
-
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
@@ -1299,11 +1303,13 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
+
switch (vp->v_type) {
case VREG:
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
+
if (!error) {
if (uiop->uio_resid) {
/*
@@ -1315,7 +1321,7 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
* writes, but that is not possible any longer.
*/
int nread = bp->b_bcount - uiop->uio_resid;
- int left = bp->b_bcount - nread;
+ int left = uiop->uio_resid;
if (left > 0)
bzero((char *)bp->b_data + nread, left);
@@ -1485,3 +1491,48 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
bufdone(bp);
return (error);
}
+
+/*
+ * Used to aid in handling ftruncate() operations on the NFS client side.
+ * Truncation creates a number of special problems for NFS.  We have to
+ * throw away VM pages and buffer cache buffers that are beyond EOF, and
+ * we have to properly handle VM pages or (potentially dirty) buffers
+ * that straddle the truncation point.
+ * Returns 0, vtruncbuf()'s error, or EINTR when nfs_getcacheblk() yields NULL.
+ */
+int
+nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
+{
+	struct nfsnode *np = VTONFS(vp);
+	u_quad_t tsize = np->n_size;	/* size before this call */
+	int biosize = vp->v_mount->mnt_stat.f_iosize;
+	int error = 0;
+
+	np->n_size = nsize;		/* update the cached size first */
+	if (np->n_size < tsize) {
+		struct buf *bp;
+		daddr_t lbn;
+		int bufsize;
+		/*
+		 * vtruncbuf() doesn't get the buffer overlapping the
+		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
+		 * buffer that now needs to be truncated.
+		 */
+		error = vtruncbuf(vp, cred, td, nsize, biosize);
+		lbn = nsize / biosize;
+		bufsize = nsize & (biosize - 1);
+		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
+		if (bp == NULL)		/* no buffer; nothing to trim */
+			return (EINTR);
+		if (bp->b_dirtyoff > bp->b_bcount)
+			bp->b_dirtyoff = bp->b_bcount;
+		if (bp->b_dirtyend > bp->b_bcount)
+			bp->b_dirtyend = bp->b_bcount;
+		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
+		brelse(bp);
+	} else {
+		vnode_pager_setsize(vp, nsize);
+	}
+	return (error);
+}
+
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index ae81f0f..be9f533b 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -643,7 +643,18 @@ nfs_setattr(struct vop_setattr_args *ap)
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
- vnode_pager_setsize(vp, vap->va_size);
+
+ /*
+ * We run vnode_pager_setsize() early (why?),
+ * we must set np->n_size now to avoid vinvalbuf
+ * V_SAVE races that might setsize a lower
+ * value.
+ */
+
+ tsize = np->n_size;
+ error = nfs_meta_setsize(vp, ap->a_cred,
+ ap->a_td, vap->va_size);
+
if (np->n_flag & NMODIFIED) {
if (vap->va_size == 0)
error = nfs_vinvalbuf(vp, 0,
@@ -656,8 +667,7 @@ nfs_setattr(struct vop_setattr_args *ap)
return (error);
}
}
- tsize = np->n_size;
- np->n_size = np->n_vattr.va_size = vap->va_size;
+ np->n_vattr.va_size = vap->va_size;
};
} else if ((vap->va_mtime.tv_sec != VNOVAL ||
vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@@ -1049,10 +1059,12 @@ nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
m_freem(mrep);
tsiz -= retlen;
if (v3) {
- if (eof || retlen == 0)
+ if (eof || retlen == 0) {
tsiz = 0;
- } else if (retlen < len)
+ }
+ } else if (retlen < len) {
tsiz = 0;
+ }
}
nfsmout:
return (error);
@@ -3114,3 +3126,4 @@ nfsfifo_close(struct vop_close_args *ap)
}
return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
}
+
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 0eb06fc..abc4194 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1630,10 +1630,24 @@ vm_page_set_validclean(vm_page_t m, int base, int size)
* use this opportunity to clear the PG_NOSYNC flag. If a process
* takes a write fault on a MAP_NOSYNC memory area the flag will
* be set again.
+ *
+ * We set valid bits inclusive of any overlap, but we can only
+ * clear dirty bits for DEV_BSIZE chunks that are fully within
+ * the range.
*/
pagebits = vm_page_bits(base, size);
m->valid |= pagebits;
+#if 0 /* NOT YET */
+ if ((frag = base & (DEV_BSIZE - 1)) != 0) {
+ frag = DEV_BSIZE - frag;
+ base += frag;
+ size -= frag;
+ if (size < 0)
+ size = 0;
+ }
+ pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
+#endif
m->dirty &= ~pagebits;
if (base == 0 && size == PAGE_SIZE) {
pmap_clear_modify(m);
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 9e6363b..b0348c6 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -298,14 +298,18 @@ vnode_pager_setsize(vp, nsize)
}
/*
* this gets rid of garbage at the end of a page that is now
- * only partially backed by the vnode...
+ * only partially backed by the vnode.
+ *
+ * XXX for some reason (I don't know yet), if we take a
+ * completely invalid page and mark it partially valid
+ * it can screw up NFS reads, so we don't allow the case.
*/
if (nsize & PAGE_MASK) {
vm_offset_t kva;
vm_page_t m;
m = vm_page_lookup(object, OFF_TO_IDX(nsize));
- if (m) {
+ if (m && m->valid) {
int base = (int)nsize & PAGE_MASK;
int size = PAGE_SIZE - base;
@@ -318,6 +322,20 @@ vnode_pager_setsize(vp, nsize)
vm_pager_unmap_page(kva);
/*
+ * XXX work around SMP data integrity race
+ * by unmapping the page from user processes.
+ * The garbage we just cleared may be mapped
+ * to a user process running on another cpu
+ * and this code is not running through normal
+ * I/O channels which handle SMP issues for
+ * us, so unmap page to synchronize all cpus.
+ *
+ * XXX should vm_pager_unmap_page() have
+ * dealt with this?
+ */
+ vm_page_protect(m, VM_PROT_NONE);
+
+ /*
* Clear out partial-page dirty bits. This
* has the side effect of setting the valid
* bits, but that is ok. There are a bunch
@@ -325,6 +343,10 @@ vnode_pager_setsize(vp, nsize)
* m->dirty == VM_PAGE_BITS_ALL. The file EOF
* case is one of them. If the page is still
* partially dirty, make it fully dirty.
+ *
+ * note that we do not clear out the valid
+ * bits. This would prevent bogus_page
+ * replacement from working properly.
*/
vm_page_set_validclean(m, base, size);
if (m->dirty != 0)
@@ -965,6 +987,9 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
* may not properly clear the dirty bits for the entire page (which
* could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
* With the page locked we are free to fix-up the dirty bits here.
+ *
+ * We do not under any circumstances truncate the valid bits, as
+ * this will screw up bogus page replacement.
*/
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
if (object->un_pager.vnp.vnp_size > poffset) {
OpenPOWER on IntegriCloud