-rw-r--r--   sys/kern/vfs_bio.c        12
-rw-r--r--   sys/nfsclient/nfs.h        2
-rw-r--r--   sys/nfsclient/nfs_bio.c   59
-rw-r--r--   sys/nfsclient/nfs_vnops.c 23
-rw-r--r--   sys/vm/vm_page.c          14
-rw-r--r--   sys/vm/vnode_pager.c      29
6 files changed, 128 insertions, 11 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index f250367..29646d9 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -2271,9 +2271,21 @@ loop:
* to softupdates re-dirtying the buffer. In the latter
* case, B_CACHE is set after the first write completes,
* preventing further loops.
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets really
+ * confusing. This is much easier.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
BUF_WRITE(bp);
goto loop;
}
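
For context, a minimal userland sketch of the flag test in this hunk: a buffer that is delayed-write (B_DELWRI) but no longer cache-valid (B_CACHE clear) gets B_NOCACHE so it is discarded once the write completes. The flag values and the decide_write() helper are invented for illustration and are not the kernel's definitions.

#include <stdio.h>

#define B_CACHE   0x01		/* buffer contents are valid */
#define B_DELWRI  0x02		/* delayed write is pending */
#define B_NOCACHE 0x04		/* discard buffer after the write completes */

static int
decide_write(int flags)
{
	/*
	 * Delayed-write but not cache-valid: the contents cannot be
	 * trusted once b*write() re-sets B_CACHE, so mark the buffer
	 * B_NOCACHE before writing it out.
	 */
	if ((flags & (B_CACHE | B_DELWRI)) == B_DELWRI)
		return (flags | B_NOCACHE);
	return (flags);
}

int
main(void)
{
	printf("%#x\n", decide_write(B_DELWRI));		/* 0x6: write, then discard */
	printf("%#x\n", decide_write(B_CACHE | B_DELWRI));	/* 0x3: contents stay cached */
	return (0);
}
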
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index 5811883..42db54c 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -288,6 +288,8 @@ void nfs_clearcommit(struct mount *);
int nfs_writebp(struct buf *, int, struct thread *);
int nfs_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
struct thread *);
+int nfs_meta_setsize (struct vnode *, struct ucred *,
+ struct thread *, u_quad_t);
#endif /* _KERNEL */
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 754d561..2f3a44b 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -206,8 +206,14 @@ nfs_getpages(struct vop_getpages_args *ap)
vm_page_set_validclean(m, 0, size - toff);
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
+ } else {
+ /*
+ * Read operation was short. If no error occurred
+ * we may have hit a zero-fill section. We simply
+ * leave valid set to 0.
+ */
+ ;
}
-
if (i != ap->a_reqpage) {
/*
* Whether or not to leave the page activated is up in
@@ -831,9 +837,7 @@ again:
else
bcount = np->n_size - (off_t)lbn * biosize;
}
-
bp = nfs_getcacheblk(vp, lbn, bcount, td);
-
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
@@ -1299,11 +1303,13 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
+
switch (vp->v_type) {
case VREG:
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
+
if (!error) {
if (uiop->uio_resid) {
/*
@@ -1315,7 +1321,7 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
* writes, but that is not possible any longer.
*/
int nread = bp->b_bcount - uiop->uio_resid;
- int left = bp->b_bcount - nread;
+ int left = uiop->uio_resid;
if (left > 0)
bzero((char *)bp->b_data + nread, left);
@@ -1485,3 +1491,48 @@ nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
bufdone(bp);
return (error);
}
+
+/*
+ * Used to aid in handling ftruncate() operations on the NFS client side.
+ * Truncation creates a number of special problems for NFS. We have to
+ * throw away VM pages and buffer cache buffers that are beyond EOF, and
+ * we have to properly handle VM pages or (potentially dirty) buffers
+ * that straddle the truncation point.
+ */
+
+int
+nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
+{
+ struct nfsnode *np = VTONFS(vp);
+ u_quad_t tsize = np->n_size;
+ int biosize = vp->v_mount->mnt_stat.f_iosize;
+ int error = 0;
+
+ np->n_size = nsize;
+
+ if (np->n_size < tsize) {
+ struct buf *bp;
+ daddr_t lbn;
+ int bufsize;
+
+ /*
+ * vtruncbuf() doesn't get the buffer overlapping the
+ * truncation point. We may have a B_DELWRI and/or B_CACHE
+ * buffer that now needs to be truncated.
+ */
+ error = vtruncbuf(vp, cred, td, nsize, biosize);
+ lbn = nsize / biosize;
+ bufsize = nsize & (biosize - 1);
+ bp = nfs_getcacheblk(vp, lbn, bufsize, td);
+ if (bp->b_dirtyoff > bp->b_bcount)
+ bp->b_dirtyoff = bp->b_bcount;
+ if (bp->b_dirtyend > bp->b_bcount)
+ bp->b_dirtyend = bp->b_bcount;
+ bp->b_flags |= B_RELBUF; /* don't leave garbage around */
+ brelse(bp);
+ } else {
+ vnode_pager_setsize(vp, nsize);
+ }
+ return(error);
+}
+
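
A worked example of the block arithmetic in nfs_meta_setsize() above: with the patch's nsize / biosize and nsize & (biosize - 1) (which assumes biosize is a power of two), truncating a file to 10000 bytes on an 8 KB I/O size leaves logical block 1 holding 1808 valid bytes. The numbers and the standalone program are illustrative only.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t nsize = 10000;	/* new (truncated) file size in bytes */
	int biosize = 8192;	/* mnt_stat.f_iosize; assumed to be a power of two */

	uint64_t lbn = nsize / biosize;		/* logical block containing EOF */
	int bufsize = nsize & (biosize - 1);	/* valid bytes left in that block */

	printf("lbn=%ju bufsize=%d\n", (uintmax_t)lbn, bufsize);	/* lbn=1 bufsize=1808 */
	return (0);
}
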
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index ae81f0f..be9f533b 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -643,7 +643,18 @@ nfs_setattr(struct vop_setattr_args *ap)
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
- vnode_pager_setsize(vp, vap->va_size);
+
+ /*
+ * We run vnode_pager_setsize() early (why?);
+ * we must set np->n_size now to avoid vinvalbuf
+ * V_SAVE races that might setsize a lower
+ * value.
+ */
+
+ tsize = np->n_size;
+ error = nfs_meta_setsize(vp, ap->a_cred,
+ ap->a_td, vap->va_size);
+
if (np->n_flag & NMODIFIED) {
if (vap->va_size == 0)
error = nfs_vinvalbuf(vp, 0,
@@ -656,8 +667,7 @@ nfs_setattr(struct vop_setattr_args *ap)
return (error);
}
}
- tsize = np->n_size;
- np->n_size = np->n_vattr.va_size = vap->va_size;
+ np->n_vattr.va_size = vap->va_size;
};
} else if ((vap->va_mtime.tv_sec != VNOVAL ||
vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@@ -1049,10 +1059,12 @@ nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
m_freem(mrep);
tsiz -= retlen;
if (v3) {
- if (eof || retlen == 0)
+ if (eof || retlen == 0) {
tsiz = 0;
- } else if (retlen < len)
+ }
+ } else if (retlen < len) {
tsiz = 0;
+ }
}
nfsmout:
return (error);
@@ -3114,3 +3126,4 @@ nfsfifo_close(struct vop_close_args *ap)
}
return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
}
+
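
The loop being re-braced in nfs_readrpc() keeps the existing termination rule: NFSv3 stops only on an explicit EOF flag or an empty reply, while NFSv2 treats any short reply as end of file. Below is a standalone sketch of that rule; struct rres and read_chunk() are hypothetical stand-ins for the per-RPC read, not the kernel interfaces.

#include <stdio.h>
#include <stddef.h>

struct rres {
	size_t	retlen;	/* bytes returned by this RPC */
	int	eof;	/* v3 only: server reported end of file */
};

static size_t file_size = 10000;	/* pretend remote file size */

static struct rres
read_chunk(size_t off, size_t len)
{
	struct rres r;

	r.retlen = off >= file_size ? 0 :
	    (file_size - off < len ? file_size - off : len);
	r.eof = off + r.retlen >= file_size;
	return (r);
}

static size_t
read_all(int v3, size_t tsiz, size_t maxlen)
{
	size_t off = 0;

	while (tsiz > 0) {
		size_t len = tsiz < maxlen ? tsiz : maxlen;
		struct rres r = read_chunk(off, len);

		off += r.retlen;
		tsiz -= r.retlen;
		if (v3) {
			/* v3: stop only on explicit EOF or an empty reply. */
			if (r.eof || r.retlen == 0)
				tsiz = 0;
		} else if (r.retlen < len) {
			/* v2: any short reply implies EOF. */
			tsiz = 0;
		}
	}
	return (off);
}

int
main(void)
{
	printf("v3: %zu bytes\n", read_all(1, 65536, 8192));	/* 10000 */
	printf("v2: %zu bytes\n", read_all(0, 65536, 8192));	/* 10000 */
	return (0);
}
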
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 0eb06fc..abc4194 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1630,10 +1630,24 @@ vm_page_set_validclean(vm_page_t m, int base, int size)
* use this opportunity to clear the PG_NOSYNC flag. If a process
* takes a write fault on a MAP_NOSYNC memory area the flag will
* be set again.
+ *
+ * We set valid bits inclusive of any overlap, but we can only
+ * clear dirty bits for DEV_BSIZE chunks that are fully within
+ * the range.
*/
pagebits = vm_page_bits(base, size);
m->valid |= pagebits;
+#if 0 /* NOT YET */
+ if ((frag = base & (DEV_BSIZE - 1)) != 0) {
+ frag = DEV_BSIZE - frag;
+ base += frag;
+ size -= frag;
+ if (size < 0)
+ size = 0;
+ }
+ pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
+#endif
m->dirty &= ~pagebits;
if (base == 0 && size == PAGE_SIZE) {
pmap_clear_modify(m);
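
The rule described in this comment, and sketched by the disabled block, is that valid bits cover every DEV_BSIZE chunk the range touches, while dirty bits may only be cleared for chunks lying entirely inside [base, base + size). Below is a minimal standalone illustration of that intent; bits_for_range() is a stand-in for vm_page_bits(), and the size is rounded down to a chunk boundary rather than following the disabled code verbatim.

#include <stdio.h>

#define DEV_BSIZE 512

static unsigned int
bits_for_range(int base, int size)
{
	if (size <= 0)
		return (0);
	int first = base / DEV_BSIZE;				/* first chunk touched */
	int last = (base + size + DEV_BSIZE - 1) / DEV_BSIZE;	/* one past the last */
	return (((1u << (last - first)) - 1) << first);
}

int
main(void)
{
	int base = 100, size = 1000;

	/* Valid bits: inclusive of any partial overlap at either end. */
	unsigned int valid = bits_for_range(base, size);

	/* Dirty bits to clear: round base up and size down to chunk edges. */
	int frag = base & (DEV_BSIZE - 1);
	int cbase = base, csize = size;
	if (frag != 0) {
		cbase += DEV_BSIZE - frag;
		csize -= DEV_BSIZE - frag;
		if (csize < 0)
			csize = 0;
	}
	csize &= ~(DEV_BSIZE - 1);		/* drop the partial tail chunk */
	unsigned int clean = bits_for_range(cbase, csize);

	printf("valid=%#x clear-dirty=%#x\n", valid, clean);	/* valid=0x7 clear-dirty=0x2 */
	return (0);
}
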
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 9e6363b..b0348c6 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -298,14 +298,18 @@ vnode_pager_setsize(vp, nsize)
}
/*
* this gets rid of garbage at the end of a page that is now
- * only partially backed by the vnode...
+ * only partially backed by the vnode.
+ *
+ * XXX for some reason (I don't know yet), if we take a
+ * completely invalid page and mark it partially valid
+ * it can screw up NFS reads, so we don't allow the case.
*/
if (nsize & PAGE_MASK) {
vm_offset_t kva;
vm_page_t m;
m = vm_page_lookup(object, OFF_TO_IDX(nsize));
- if (m) {
+ if (m && m->valid) {
int base = (int)nsize & PAGE_MASK;
int size = PAGE_SIZE - base;
@@ -318,6 +322,20 @@ vnode_pager_setsize(vp, nsize)
vm_pager_unmap_page(kva);
/*
+ * XXX work around SMP data integrity race
+ * by unmapping the page from user processes.
+ * The garbage we just cleared may be mapped
+ * to a user process running on another cpu
+ * and this code is not running through normal
+ * I/O channels which handle SMP issues for
+ * us, so unmap page to synchronize all cpus.
+ *
+ * XXX should vm_pager_unmap_page() have
+ * dealt with this?
+ */
+ vm_page_protect(m, VM_PROT_NONE);
+
+ /*
* Clear out partial-page dirty bits. This
* has the side effect of setting the valid
* bits, but that is ok. There are a bunch
@@ -325,6 +343,10 @@ vnode_pager_setsize(vp, nsize)
* m->dirty == VM_PAGE_BITS_ALL. The file EOF
* case is one of them. If the page is still
* partially dirty, make it fully dirty.
+ *
+ * note that we do not clear out the valid
+ * bits. This would prevent bogus_page
+ * replacement from working properly.
*/
vm_page_set_validclean(m, base, size);
if (m->dirty != 0)
@@ -965,6 +987,9 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
* may not properly clear the dirty bits for the entire page (which
* could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
* With the page locked we are free to fix-up the dirty bits here.
+ *
+ * We do not under any circumstances truncate the valid bits, as
+ * this will screw up bogus page replacement.
*/
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
if (object->un_pager.vnp.vnp_size > poffset) {
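
The check that follows this comment clamps the write at the vnode size, so a putpages request extending past EOF only writes the bytes that actually exist; the valid bits themselves are left untouched. A small sketch of that clamp follows; clamp_putpages() and its arguments are illustrative, merely mirroring the names in the hunk.

#include <stdio.h>
#include <stdint.h>

static size_t
clamp_putpages(uint64_t poffset, size_t maxsize, uint64_t vnp_size)
{
	if (poffset + maxsize > vnp_size) {
		if (vnp_size > poffset)
			maxsize = vnp_size - poffset;	/* write only up to EOF */
		else
			maxsize = 0;			/* request is entirely past EOF */
	}
	return (maxsize);
}

int
main(void)
{
	printf("%zu\n", clamp_putpages(8192, 4096, 10000));	/* 1808 */
	printf("%zu\n", clamp_putpages(16384, 4096, 10000));	/* 0 */
	return (0);
}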