-rw-r--r--  sys/gnu/ext2fs/ext2_bmap.c        3
-rw-r--r--  sys/gnu/fs/ext2fs/ext2_bmap.c     3
-rw-r--r--  sys/kern/vfs_bio.c              830
-rw-r--r--  sys/kern/vfs_cluster.c           10
-rw-r--r--  sys/kern/vfs_default.c           12
-rw-r--r--  sys/nfs/nfs.h                     5
-rw-r--r--  sys/nfs/nfs_bio.c               413
-rw-r--r--  sys/nfs/nfs_nqlease.c             6
-rw-r--r--  sys/nfs/nfs_socket.c            136
-rw-r--r--  sys/nfs/nfs_vnops.c              38
-rw-r--r--  sys/nfsclient/nfs.h               5
-rw-r--r--  sys/nfsclient/nfs_bio.c         413
-rw-r--r--  sys/nfsclient/nfs_socket.c      136
-rw-r--r--  sys/nfsclient/nfs_vnops.c        38
-rw-r--r--  sys/nfsclient/nfsargs.h           5
-rw-r--r--  sys/nfsclient/nfsstats.h          5
-rw-r--r--  sys/nfsserver/nfs.h               5
-rw-r--r--  sys/nfsserver/nfs_srvsock.c     136
-rw-r--r--  sys/nfsserver/nfsrvstats.h        5
-rw-r--r--  sys/sys/bio.h                    40
-rw-r--r--  sys/sys/buf.h                    40
-rw-r--r--  sys/ufs/ffs/ffs_inode.c           3
-rw-r--r--  sys/ufs/mfs/mfs_vnops.c           5
-rw-r--r--  sys/ufs/ufs/ufs_bmap.c            3
-rw-r--r--  sys/vm/vm_fault.c                30
-rw-r--r--  sys/vm/vm_page.c                 51
-rw-r--r--  sys/vm/vm_page.h                  8
-rw-r--r--  sys/vm/vm_pager.c                 5
-rw-r--r--  sys/vm/vnode_pager.c              5
-rw-r--r--  sys/vm/vm_pager.h                18
30 files changed, 1414 insertions, 998 deletions
diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/gnu/ext2fs/ext2_bmap.c
+++ b/sys/gnu/ext2fs/ext2_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/gnu/fs/ext2fs/ext2_bmap.c
+++ b/sys/gnu/fs/ext2fs/ext2_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
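
The one-line change in both ext2_bmap hunks above is one instance of a protocol
applied throughout this commit: B_INVAL and B_ERROR must be cleared before
initiating a read, since a recycled buffer header may still carry stale state.
A minimal sketch of the pattern, with locking and error details elided (the
same flag dance appears again in the vfs_cluster.c and nfs_bio.c hunks below):

        bp->b_flags |= B_READ;
        bp->b_flags &= ~(B_INVAL | B_ERROR);    /* stale state from prior use */
        vfs_busy_pages(bp, 0);                  /* busy backing pages for read */
        VOP_STRATEGY(bp->b_vp, bp);             /* initiate the I/O */
        error = biowait(bp);                    /* synchronous caller waits */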
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 803aab1..cb18320 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.206 1999/04/14 18:51:52 dt Exp $
+ * $Id: vfs_bio.c,v 1.207 1999/04/29 18:15:25 alc Exp $
*/
/*
@@ -74,9 +74,6 @@ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
vm_offset_t to);
-static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
- vm_offset_t off, vm_offset_t size,
- vm_page_t m);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
@@ -222,6 +219,27 @@ bufcountwakeup(void)
}
/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline__
+void
+vfs_buf_test_cache(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+
+/*
* Initialize buffer headers and related structures.
*/
void
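
To make the vfs_buf_test_cache() check added above concrete, a worked example,
assuming PAGE_SIZE 4096 and a buffer at b_offset (foff) 4608, i.e. 512 bytes
into its second page, being extended by 1024 bytes at buffer-relative offset
(off) 3584 against page m:

        base = (foff + off) & PAGE_MASK = (4608 + 3584) & 4095 = 0

so the extension covers bytes 0-1023 of m. If vm_page_is_valid(m, 0, 1024)
fails, B_CACHE is cleared and the caller must treat the buffer as needing I/O.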
@@ -371,7 +389,10 @@ bremfree(struct buf * bp)
/*
- * Get a buffer with the specified data. Look in the cache first.
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything ( see
+ * getblk() ).
*/
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
@@ -388,7 +409,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
curproc->p_stats->p_ru.ru_inblock++;
KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -403,7 +424,9 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
/*
* Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.
+ * read-ahead blocks. We must clear B_ERROR and B_INVAL prior
+ * to initiating I/O. If B_CACHE is set, the buffer is valid
+ * and we do not have to do anything.
*/
int
breadn(struct vnode * vp, daddr_t blkno, int size,
@@ -421,7 +444,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -441,7 +464,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
rabp->b_flags |= B_READ | B_ASYNC;
- rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ rabp->b_flags &= ~(B_ERROR | B_INVAL);
if (rabp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
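
For reference, the caller-side contract of bread()/breadn() is unchanged by
this commit: the caller gets a busy buffer back through *bpp whether or not
the read succeeded, and must brelse() it either way. A minimal sketch of the
usual FFS-style usage:

        struct buf *bp;
        int error;

        error = bread(vp, lbn, size, NOCRED, &bp);
        if (error) {
                brelse(bp);     /* buffer is returned even on error */
                return (error);
        }
        /* ... use bp->b_data ... */
        brelse(bp);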
@@ -462,7 +485,14 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
/*
* Write, release buffer on completion. (Done by iodone
- * if async.)
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
*/
int
bwrite(struct buf * bp)
@@ -486,7 +516,7 @@ bwrite(struct buf * bp)
bundirty(bp);
bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
- bp->b_flags |= B_WRITEINPROG;
+ bp->b_flags |= B_WRITEINPROG | B_CACHE;
bp->b_vp->v_numoutput++;
vfs_busy_pages(bp, 1);
@@ -505,11 +535,12 @@ bwrite(struct buf * bp)
mp = vp->v_specmountpoint;
else
mp = vp->v_mount;
- if (mp != NULL)
+ if (mp != NULL) {
if ((oldflags & B_ASYNC) == 0)
mp->mnt_stat.f_syncwrites++;
else
mp->mnt_stat.f_asyncwrites++;
+ }
}
if ((oldflags & B_ASYNC) == 0) {
@@ -522,7 +553,13 @@ bwrite(struct buf * bp)
}
/*
- * Delayed write. (Buffer is marked dirty).
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather than in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
*/
void
bdwrite(struct buf * bp)
@@ -542,6 +579,12 @@ bdwrite(struct buf * bp)
bdirty(bp);
/*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
* This bmap keeps the system from needing to do the bmap later,
* perhaps when the system is attempting to do a sync. Since it
* is likely that the indirect block -- or whatever other datastructure
@@ -592,8 +635,11 @@ bdwrite(struct buf * bp)
* B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
* itself to properly update it in the dirty/clean lists. We mark it
* B_DONE to ensure that any asynchronization of the buffer properly
- * clears B_DONE ( else a panic will occur later ). Note that B_INVALID
- * buffers are not considered dirty even if B_DELWRI is set.
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
*
* Since the buffer is not on a queue, we do not update the numfreebuffers
* count.
@@ -645,6 +691,9 @@ bundirty(bp)
*
* Asynchronous write. Start output on a buffer, but do not wait for
* it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
*/
void
bawrite(struct buf * bp)
@@ -658,7 +707,8 @@ bawrite(struct buf * bp)
*
* Ordered write. Start output on a buffer, and flag it so that the
* device will write it in the order it was queued. The buffer is
- * released when the output completes.
+ * released when the output completes. bwrite() ( or the VOP routine
+ * anyway ) is responsible for handling B_INVAL buffers.
*/
int
bowrite(struct buf * bp)
@@ -694,10 +744,19 @@ brelse(struct buf * bp)
bp->b_flags &= ~B_ERROR;
if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) {
+ /*
+ * Failed write, redirty. Must clear B_ERROR to prevent
+ * pages from being scrapped. Note: B_INVAL is ignored
+ * here but will presumably be dealt with later.
+ */
bp->b_flags &= ~B_ERROR;
bdirty(bp);
} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
(bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed I/O or we were asked to free or not
+ * cache the buffer.
+ */
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
@@ -727,31 +786,22 @@ brelse(struct buf * bp)
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
- * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
- * but the VM object is kept around. The B_NOCACHE flag is used to
- * invalidate the pages in the VM object.
+ * constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
*
- * The b_{validoff,validend,dirtyoff,dirtyend} values are relative
- * to b_offset and currently have byte granularity, whereas the
- * valid flags in the vm_pages have only DEV_BSIZE resolution.
- * The byte resolution fields are used to avoid unnecessary re-reads
- * of the buffer but the code really needs to be genericized so
- * other filesystem modules can take advantage of these fields.
+ * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. B_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
*
- * XXX this seems to cause performance problems.
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer.
*/
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
bp->b_vp->v_type != VBLK &&
- (bp->b_flags & B_DELWRI) != 0)
-#ifdef notdef
- && (bp->b_vp->v_tag != VT_NFS
- || bp->b_vp->v_type == VBLK
- || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
- || bp->b_validend == 0
- || (bp->b_validoff == 0
- && bp->b_validend == bp->b_bufsize))
-#endif
+ (bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@@ -912,6 +962,11 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
* it.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
*/
void
bqrelse(struct buf * bp)
@@ -1096,6 +1151,8 @@ vfs_bio_awrite(struct buf * bp)
splx(s);
/*
* default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
*/
nwritten = bp->b_bufsize;
(void) VOP_BWRITE(bp);
@@ -1107,7 +1164,11 @@ vfs_bio_awrite(struct buf * bp)
* getnewbuf:
*
* Find and initialize a new buffer header, freeing up existing buffers
- * in the bufqueues as necessary.
+ * in the bufqueues as necessary. The new buffer is returned with
+ * flags set to B_BUSY.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
*
* We block if:
* We have insufficient buffer headers
@@ -1368,7 +1429,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_validoff = bp->b_validend = 0;
bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@@ -1465,7 +1525,10 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
-
+
+ /*
+ * The bp, if valid, is set to B_BUSY.
+ */
return (bp);
}
@@ -1546,9 +1609,10 @@ flushbufqueues(void)
}
/*
- * XXX NFS does weird things with B_INVAL bps if we bwrite
- * them ( vfs_bio_awrite/bawrite/bdwrite/etc ) Why?
- *
+ * Try to free up B_INVAL delayed-write buffers rather than
+ * writing them out. Note also that NFS is somewhat sensitive
+ * to B_INVAL buffers so it is doubly important that we do
+ * this.
*/
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@@ -1622,20 +1686,28 @@ inmem(struct vnode * vp, daddr_t blkno)
}
/*
- * now we set the dirty range for the buffer --
- * for NFS -- if the file is mapped and pages have
- * been written to, let it know. We want the
- * entire range of the buffer to be marked dirty if
- * any of the pages have been written to for consistancy
- * with the b_validoff, b_validend set in the nfs write
- * code, and used by the nfs read code.
+ * vfs_setdirty:
+ *
+ * Sets the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer.
+ *
+ * The range is limited to the size of the buffer.
+ *
+ * This routine is primarily used by NFS, but is generalized for the
+ * B_VMIO case.
*/
static void
vfs_setdirty(struct buf *bp)
{
int i;
vm_object_t object;
- vm_offset_t boffset;
+
+ /*
+ * Degenerate case - empty buffer
+ */
+
+ if (bp->b_bufsize == 0)
+ return;
/*
* We qualify the scan for modified pages on whether the
@@ -1654,6 +1726,9 @@ vfs_setdirty(struct buf *bp)
printf("Warning: object %p mightbedirty but not writeable\n", object);
if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
/*
* test the pages to see if they have been modified directly
* by users through the VM system.
@@ -1664,47 +1739,85 @@ vfs_setdirty(struct buf *bp)
}
/*
- * scan forwards for the first page modified
+ * Calculate the encompassing dirty range, boffset through eoffset,
+ * i.e. (eoffset - boffset) bytes.
*/
+
for (i = 0; i < bp->b_npages; i++) {
- if (bp->b_pages[i]->dirty) {
+ if (bp->b_pages[i]->dirty)
break;
- }
}
-
boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
- if (boffset < bp->b_dirtyoff) {
- bp->b_dirtyoff = max(boffset, 0);
- }
- /*
- * scan backwards for the last page modified
- */
for (i = bp->b_npages - 1; i >= 0; --i) {
if (bp->b_pages[i]->dirty) {
break;
}
}
- boffset = (i + 1);
-#if 0
- offset = boffset + bp->b_pages[0]->pindex;
- if (offset >= object->size)
- boffset = object->size - bp->b_pages[0]->pindex;
-#endif
- boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
- if (bp->b_dirtyend < boffset)
- bp->b_dirtyend = min(boffset, bp->b_bufsize);
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
}
}
/*
- * Get a block given a specified block and offset into a file/device.
+ * getblk:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ * The buffer's B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and B_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
*/
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
struct buf *bp;
- int i, s;
+ int s;
struct bufhashhdr *bh;
#if !defined(MAX_PERF)
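
The getblk() contract spelled out in the comment above is worth restating as
code. A minimal sketch of the intended caller protocol for a synchronous
strategy-path read; the NFS hunks later in this diff follow the same shape,
substituting nfs_doio() for the VOP_STRATEGY()/biowait() pair:

        struct buf *bp;
        int error;

        bp = getblk(vp, lbn, size, 0, 0);
        if ((bp->b_flags & B_CACHE) == 0) {
                /* not fully valid: issue the read per the rules above */
                bp->b_flags |= B_READ;
                bp->b_flags &= ~(B_INVAL | B_ERROR);
                vfs_busy_pages(bp, 0);
                VOP_STRATEGY(vp, bp);
                error = biowait(bp);    /* biodone() sets B_CACHE on success */
                if (error) {
                        brelse(bp);
                        return (error);
                }
        }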
@@ -1727,6 +1840,10 @@ loop:
}
if ((bp = gbincore(vp, blkno))) {
+ /*
+ * Buffer is in-core
+ */
+
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
if (bp->b_usecount < BUF_MAXUSE)
@@ -1740,7 +1857,18 @@ loop:
splx(s);
return (struct buf *) NULL;
}
- bp->b_flags |= B_BUSY | B_CACHE;
+
+ /*
+ * Busy the buffer. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ bp->b_flags |= B_BUSY;
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO|B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
bremfree(bp);
/*
@@ -1770,7 +1898,9 @@ loop:
/*
* If the size is inconsistent in the VMIO case, we can resize
- * the buffer. This might lead to B_CACHE getting cleared.
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
*/
if (bp->b_bcount != size)
@@ -1780,45 +1910,19 @@ loop:
("getblk: no buffer offset"));
/*
- * Check that the constituted buffer really deserves for the
- * B_CACHE bit to be set. B_VMIO type buffers might not
- * contain fully valid pages. Normal (old-style) buffers
- * should be fully valid. This might also lead to B_CACHE
- * getting clear.
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
*
- * If B_CACHE is already clear, don't bother checking to see
- * if we have to clear it again.
- *
- * XXX this code should not be necessary unless the B_CACHE
- * handling is broken elsewhere in the kernel. We need to
- * check the cases and then turn the clearing part of this
- * code into a panic.
- */
- if (
- (bp->b_flags & (B_VMIO|B_CACHE)) == (B_VMIO|B_CACHE) &&
- (bp->b_vp->v_tag != VT_NFS || bp->b_validend <= 0)
- ) {
- int checksize = bp->b_bufsize;
- int poffset = bp->b_offset & PAGE_MASK;
- int resid;
- for (i = 0; i < bp->b_npages; i++) {
- resid = (checksize > (PAGE_SIZE - poffset)) ?
- (PAGE_SIZE - poffset) : checksize;
- if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
- bp->b_flags &= ~(B_CACHE | B_DONE);
- break;
- }
- checksize -= resid;
- poffset = 0;
- }
- }
-
- /*
- * If B_DELWRI is set and B_CACHE got cleared ( or was
- * already clear ), we have to commit the write and
- * retry. The NFS code absolutely depends on this,
- * and so might the FFS code. In anycase, it formalizes
- * the B_CACHE rules. See sys/buf.h.
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
@@ -1829,8 +1933,14 @@ loop:
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
splx(s);
- return (bp);
+ bp->b_flags &= ~B_DONE;
} else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is marked B_BUSY. Note that the
+ * returned buffer is also considered valid ( not marked
+ * B_INVAL ).
+ */
int bsize, maxsize, vmio;
off_t offset;
@@ -1849,7 +1959,7 @@ loop:
maxsize = imax(maxsize, bsize);
if ((bp = getnewbuf(vp, blkno,
- slpflag, slptimeo, size, maxsize)) == 0) {
+ slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@@ -1861,6 +1971,10 @@ loop:
* This code is used to make sure that a buffer is not
* created while the getnewbuf routine is blocked.
* This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created. There is now window
+ * race because we are safely running at splbio() from the
+ * point of the duplicate buffer creation through to here.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -1880,8 +1994,15 @@ loop:
bh = BUFHASH(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
+ /*
+ * Set the B_VMIO bit and allocbuf() the buffer bigger. Since the
+ * buffer size starts out as 0, B_CACHE will be set by
+ * allocbuf() for the VMIO case prior to it testing the
+ * backing store for validity.
+ */
+
if (vmio) {
- bp->b_flags |= (B_VMIO | B_CACHE);
+ bp->b_flags |= B_VMIO;
#if defined(VFS_BIO_DEBUG)
if (vp->v_type != VREG && vp->v_type != VBLK)
printf("getblk: vmioing file type %d???\n", vp->v_type);
@@ -1893,12 +2014,14 @@ loop:
allocbuf(bp, size);
splx(s);
- return (bp);
+ bp->b_flags &= ~B_DONE;
}
+ return (bp);
}
/*
- * Get an empty, disassociated buffer of given size.
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
*/
struct buf *
geteblk(int size)
@@ -1910,7 +2033,7 @@ geteblk(int size)
while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
- bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
return (bp);
}
@@ -1925,6 +2048,9 @@ geteblk(int size)
* deadlock or inconsistent data situations. Tread lightly!!!
* There are B_CACHE and B_DELWRI interactions that must be dealt with by
* the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
*/
int
@@ -1945,7 +2071,8 @@ allocbuf(struct buf *bp, int size)
caddr_t origbuf;
int origbufsize;
/*
- * Just get anonymous memory from the kernel
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
*/
mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
@@ -2046,13 +2173,25 @@ allocbuf(struct buf *bp, int size)
if (bp->b_flags & B_MALLOC)
panic("allocbuf: VMIO buffer can't be malloced");
#endif
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
if (newbsize < bp->b_bufsize) {
+ /*
+ * DEV_BSIZE aligned new buffer size is less than the
+ * DEV_BSIZE aligned existing buffer size. Figure out
+ * if we have to remove any pages.
+ */
if (desiredpages < bp->b_npages) {
for (i = desiredpages; i < bp->b_npages; i++) {
/*
* the page is not freed here -- it
- * is the responsibility of vnode_pager_setsize
+ * is the responsibility of
+ * vnode_pager_setsize
*/
m = bp->b_pages[i];
KASSERT(m != bogus_page,
@@ -2067,115 +2206,131 @@ allocbuf(struct buf *bp, int size)
(desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
bp->b_npages = desiredpages;
}
- } else if (newbsize > bp->b_bufsize) {
- vm_object_t obj;
- vm_offset_t tinc, toff;
- vm_ooffset_t off;
- vm_pindex_t objoff;
- int pageindex, curbpnpages;
+ } else if (size > bp->b_bcount) {
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
struct vnode *vp;
- int bsize;
- int orig_validoff = bp->b_validoff;
- int orig_validend = bp->b_validend;
-
- vp = bp->b_vp;
-
- if (vp->v_type == VBLK)
- bsize = DEV_BSIZE;
- else
- bsize = vp->v_mount->mnt_stat.f_iosize;
-
- if (bp->b_npages < desiredpages) {
- obj = vp->v_object;
- tinc = PAGE_SIZE;
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
- off = bp->b_offset;
- KASSERT(bp->b_offset != NOOFFSET,
- ("allocbuf: no buffer offset"));
- curbpnpages = bp->b_npages;
- doretry:
- bp->b_validoff = orig_validoff;
- bp->b_validend = orig_validend;
- bp->b_flags |= B_CACHE;
- for (toff = 0; toff < newbsize; toff += tinc) {
- objoff = OFF_TO_IDX(off + toff);
- pageindex = objoff - OFF_TO_IDX(off);
- tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
- if (pageindex < curbpnpages) {
-
- m = bp->b_pages[pageindex];
-#ifdef VFS_BIO_DIAG
- if (m->pindex != objoff)
- panic("allocbuf: page changed offset?!!!?");
-#endif
- if (tinc > (newbsize - toff))
- tinc = newbsize - toff;
- if (bp->b_flags & B_CACHE)
- vfs_buf_set_valid(bp, off, toff, tinc, m);
- continue;
- }
- m = vm_page_lookup(obj, objoff);
- if (!m) {
- m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
- if (!m) {
- VM_WAIT;
- vm_pageout_deficit += (desiredpages - curbpnpages);
- goto doretry;
- }
+ /*
+ * Step 1, bring in the VM pages from the object,
+ * allocating them if necessary. We must clear
+ * B_CACHE if these pages are not valid for the
+ * range covered by the buffer.
+ */
+ vp = bp->b_vp;
+ obj = vp->v_object;
+
+ while (bp->b_npages < desiredpages) {
+ vm_page_t m;
+ vm_pindex_t pi;
+
+ pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
+ if ((m = vm_page_lookup(obj, pi)) == NULL) {
+ m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
+ if (m == NULL) {
+ VM_WAIT;
+ vm_pageout_deficit += desiredpages - bp->b_npages;
+ } else {
vm_page_wire(m);
vm_page_wakeup(m);
bp->b_flags &= ~B_CACHE;
-
- } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) {
- /*
- * If we had to sleep, retry.
- *
- * Also note that we only test
- * PG_BUSY here, not m->busy.
- *
- * We cannot sleep on m->busy
- * here because a vm_fault ->
- * getpages -> cluster-read ->
- * ...-> allocbuf sequence
- * will convert PG_BUSY to
- * m->busy so we have to let
- * m->busy through if we do
- * not want to deadlock.
- */
- goto doretry;
- } else {
- if ((curproc != pageproc) &&
- ((m->queue - m->pc) == PQ_CACHE) &&
- ((cnt.v_free_count + cnt.v_cache_count) <
- (cnt.v_free_min + cnt.v_cache_min))) {
- pagedaemon_wakeup();
- }
- if (tinc > (newbsize - toff))
- tinc = newbsize - toff;
- if (bp->b_flags & B_CACHE)
- vfs_buf_set_valid(bp, off, toff, tinc, m);
- vm_page_flag_clear(m, PG_ZERO);
- vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
}
- bp->b_pages[pageindex] = m;
- curbpnpages = pageindex + 1;
+ continue;
}
- if (vp->v_tag == VT_NFS &&
- vp->v_type != VBLK) {
- if (bp->b_dirtyend > 0) {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
- }
- if (bp->b_validend == 0)
- bp->b_flags &= ~B_CACHE;
+
+ /*
+ * We found a page. If we have to sleep on it,
+ * retry because it might have gotten freed out
+ * from under us.
+ *
+ * We can only test PG_BUSY here. Blocking on
+ * m->busy might lead to a deadlock:
+ *
+ * vm_fault->getpages->cluster_read->allocbuf
+ *
+ */
+
+ if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
+ continue;
+
+ /*
+ * We have a good page. Should we wakeup the
+ * page daemon?
+ */
+ if ((curproc != pageproc) &&
+ ((m->queue - m->pc) == PQ_CACHE) &&
+ ((cnt.v_free_count + cnt.v_cache_count) <
+ (cnt.v_free_min + cnt.v_cache_min))
+ ) {
+ pagedaemon_wakeup();
}
- bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
- bp->b_npages = curbpnpages;
- pmap_qenter((vm_offset_t) bp->b_data,
- bp->b_pages, bp->b_npages);
- ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
}
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+ * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >>
+ PAGE_SHIFT;
+
+ vfs_buf_test_cache(
+ bp,
+ bp->b_offset,
+ toff,
+ tinc,
+ bp->b_pages[pi]
+ );
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+
+ /*
+ * Step 3, fixup the KVM pmap. Remember that
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+
+ bp->b_data = (caddr_t)
+ trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter(
+ (vm_offset_t)bp->b_data,
+ bp->b_pages,
+ bp->b_npages
+ );
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
if (bp->b_flags & B_VMIO)
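
A worked pass through the Step 2 scan above, assuming PAGE_SIZE 4096 and a
buffer with b_offset 9216 (page-relative offset 1024) growing from b_bcount 0
to size 6144:

        iteration 1: toff = 0,    tinc = 4096 - 1024 = 3072,
                     pi = (1024 + 0) >> PAGE_SHIFT = 0
        iteration 2: toff = 3072, tinc clipped to 6144 - 3072 = 3072,
                     pi = (1024 + 3072) >> PAGE_SHIFT = 1
        toff reaches 6144 == size and the scan ends.

Each step calls vfs_buf_test_cache() against bp->b_pages[pi]; the first range
found invalid clears B_CACHE and terminates the while loop.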
@@ -2184,13 +2339,17 @@ allocbuf(struct buf *bp, int size)
runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
- bp->b_bufsize = newbsize;
- bp->b_bcount = size;
+ bp->b_bufsize = newbsize; /* actual buffer allocation */
+ bp->b_bcount = size; /* requested buffer size */
return 1;
}
/*
- * Wait for buffer I/O completion, returning error status.
+ * biowait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left B_BUSY|B_DONE on return. B_EINTR is converted into an EINTR
+ * error and cleared.
*/
int
biowait(register struct buf * bp)
@@ -2220,9 +2379,23 @@ biowait(register struct buf * bp)
}
/*
- * Finish I/O on a buffer, calling an optional function.
- * This is usually called from interrupt level, so process blocking
- * is not *a good idea*.
+ * biodone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ * read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * biodone does not mess with B_INVAL, allowing the I/O routine or the
+ * initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
*/
void
biodone(register struct buf * bp)
@@ -2295,7 +2468,17 @@ biodone(register struct buf * bp)
obj->paging_in_progress, bp->b_npages);
}
#endif
- iosize = bp->b_bufsize;
+
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+ * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ iosize = bp->b_bcount;
+ if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
+ bp->b_flags |= B_CACHE;
+ }
+
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
m = bp->b_pages[i];
@@ -2307,6 +2490,7 @@ biodone(register struct buf * bp)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
+ bp->b_flags &= ~B_CACHE;
continue;
}
bp->b_pages[i] = m;
@@ -2325,8 +2509,8 @@ biodone(register struct buf * bp)
/*
* In the write case, the valid and clean bits are
- * already changed correctly, so we only need to do this
- * here in the read case.
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
*/
if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
vfs_page_set_valid(bp, foff, i, m);
@@ -2453,106 +2637,45 @@ vfs_unbusy_pages(struct buf * bp)
}
/*
- * Set NFS' b_validoff and b_validend fields from the valid bits
- * of a page. If the consumer is not NFS, and the page is not
- * valid for the entire range, clear the B_CACHE flag to force
- * the consumer to re-read the page.
+ * vfs_page_set_valid:
*
- * B_CACHE interaction is especially tricky.
- */
-static void
-vfs_buf_set_valid(struct buf *bp,
- vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
- vm_page_t m)
-{
- if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
- vm_offset_t svalid, evalid;
- int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
-
- /*
- * This only bothers with the first valid range in the
- * page.
- */
- svalid = off;
- while (validbits && !(validbits & 1)) {
- svalid += DEV_BSIZE;
- validbits >>= 1;
- }
- evalid = svalid;
- while (validbits & 1) {
- evalid += DEV_BSIZE;
- validbits >>= 1;
- }
- evalid = min(evalid, off + size);
- /*
- * We can only set b_validoff/end if this range is contiguous
- * with the range built up already. If we cannot set
- * b_validoff/end, we must clear B_CACHE to force an update
- * to clean the bp up.
- */
- if (svalid == bp->b_validend) {
- bp->b_validoff = min(bp->b_validoff, svalid);
- bp->b_validend = max(bp->b_validend, evalid);
- } else {
- bp->b_flags &= ~B_CACHE;
- }
- } else if (!vm_page_is_valid(m,
- (vm_offset_t) ((foff + off) & PAGE_MASK),
- size)) {
- bp->b_flags &= ~B_CACHE;
- }
-}
-
-/*
- * Set the valid bits in a page, taking care of the b_validoff,
- * b_validend fields which NFS uses to optimise small reads. Off is
- * the offset within the file and pageno is the page index within the buf.
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
*
- * XXX we have to set the valid & clean bits for all page fragments
- * touched by b_validoff/validend, even if the page fragment goes somewhat
- * beyond b_validoff/validend due to alignment.
+ * For NFS, the range is additionally restricted to b_validoff/end.
+ * validoff/end must be DEV_BSIZE chunky or the end must be at the
+ * file EOF. If a dirty range exists, set the page's dirty bits
+ * inclusively.
+ *
+ * This routine is typically called after a read completes.
*/
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
- struct vnode *vp = bp->b_vp;
vm_ooffset_t soff, eoff;
/*
* Start and end offsets in buffer. eoff - soff may not cross a
- * page boundry or cross the end of the buffer.
+ * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
*/
soff = off;
eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
- if (eoff > bp->b_offset + bp->b_bufsize)
- eoff = bp->b_offset + bp->b_bufsize;
-
- if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
- vm_ooffset_t sv, ev;
- vm_page_set_invalid(m,
- (vm_offset_t) (soff & PAGE_MASK),
- (vm_offset_t) (eoff - soff));
- /*
- * bp->b_validoff and bp->b_validend restrict the valid range
- * that we can set. Note that these offsets are not DEV_BSIZE
- * aligned. vm_page_set_validclean() must know what
- * sub-DEV_BSIZE ranges to clear.
- */
-#if 0
- sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
- ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) &
- ~(DEV_BSIZE - 1);
-#endif
- sv = bp->b_offset + bp->b_validoff;
- ev = bp->b_offset + bp->b_validend;
- soff = qmax(sv, soff);
- eoff = qmin(ev, eoff);
- }
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
- if (eoff > soff)
- vm_page_set_validclean(m,
- (vm_offset_t) (soff & PAGE_MASK),
- (vm_offset_t) (eoff - soff));
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
}
/*
@@ -2562,6 +2685,10 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* almost as being PG_BUSY. Also the object paging_in_progress
* flag is handled to make sure that the object doesn't become
* inconsistent.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as B_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
*/
void
vfs_busy_pages(struct buf * bp, int clear_modify)
@@ -2595,6 +2722,22 @@ retry:
vm_page_io_start(m);
}
+ /*
+ * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+
vm_page_protect(m, VM_PROT_NONE);
if (clear_modify)
vfs_page_set_valid(bp, foff, i, m);
@@ -2614,30 +2757,89 @@ retry:
* Tell the VM system that the pages associated with this buffer
* are clean. This is used for delayed writes where the data is
* going to go to disk eventually without additional VM intervention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
*/
-void
+static void
vfs_clean_pages(struct buf * bp)
{
int i;
if (bp->b_flags & B_VMIO) {
vm_ooffset_t foff;
+
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
+ vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ vm_ooffset_t eoff = noff;
+
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
vfs_page_set_valid(bp, foff, i, m);
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
}
}
}
+/*
+ * vfs_bio_set_validclean:
+ *
+ * Set the range within the buffer to valid and clean. The range is
+ * relative to the beginning of the buffer, b_offset. Note that b_offset
+ * itself may be offset from the beginning of the first page.
+ */
+
+void
+vfs_bio_set_validclean(struct buf *bp, int base, int size)
+{
+ if (bp->b_flags & B_VMIO) {
+ int i;
+ int n;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (n > size)
+ n = size;
+
+ vm_page_set_validclean(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
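
A sketch of how the new routine is meant to be used; the nfs_write() hunk
later in this diff does exactly this after copying user data into the buffer:

        /*
         * After copying n bytes into bp->b_data + on, mark just that
         * byte range valid and clean in the backing pages:
         */
        vfs_bio_set_validclean(bp, on, n);

Worked numbers, assuming PAGE_SIZE 4096, b_offset = 512, base = 1000 and
size = 5000: base becomes 1512, the first page validates bytes 1512-4095
(n = 2584), and the second page validates the remaining 2416 bytes.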
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * clear a buffer. This routine essentially fakes an I/O, so we need
+ * to clear B_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+
void
vfs_bio_clrbuf(struct buf *bp) {
int i, mask = 0;
caddr_t sa, ea;
if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
(bp->b_offset & PAGE_MASK) == 0) {
mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index f7bd95e..5f7f870 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $
+ * $Id: vfs_cluster.c,v 1.80 1999/03/12 02:24:56 julian Exp $
*/
#include "opt_debug_cluster.h"
@@ -251,6 +251,7 @@ single_block_read:
#endif
if ((bp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(bp, 0);
+ bp->b_flags &= ~(B_ERROR|B_INVAL);
error = VOP_STRATEGY(vp, bp);
curproc->p_stats->p_ru.ru_inblock++;
}
@@ -283,6 +284,7 @@ single_block_read:
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
+ rbp->b_flags &= ~(B_ERROR|B_INVAL);
(void) VOP_STRATEGY(vp, rbp);
curproc->p_stats->p_ru.ru_inblock++;
}
@@ -473,8 +475,10 @@ cluster_callback(bp)
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
- } else
- tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~(B_ERROR|B_INVAL);
+ }
biodone(tbp);
}
relpbuf(bp, &cluster_pbuf_freecnt);
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index c0565a4..de5d18d 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -138,6 +138,18 @@ vop_panic(struct vop_generic_args *ap)
panic("illegal vnode op called");
}
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * B_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a B_READ strategy call. Typically
+ * B_INVAL is assumed to already be clear prior to a write and should not
+ * be cleared manually unless you just made the buffer invalid. B_ERROR
+ * should be cleared either way.
+ */
+
static int
vop_nostrategy (struct vop_strategy_args *ap)
{
diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfs/nfs.h
+++ b/sys/nfs/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index cef982b..0d8a782 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
+ * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
-static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
- int i, error, nextoff, size, toff, npages, count;
+ int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ npages = btoc(count);
+
+ /*
+ * If the requested page is partially valid, just return it and
+ * allow the pager to zero-out the blanks. Partially valid pages
+ * can only occur at the file EOF.
+ */
+
+ {
+ vm_page_t m = pages[ap->a_reqpage];
+
+ if (m->valid != 0) {
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
+ return(0);
+ }
+ }
+
/*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
- npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
- * Read operation filled a partial page, set valid
- * bits properly. validclean will zero out
- * any cruft in the buffer when setting a valid bit,
- * if the size is not DEV_BSIZE aligned.
+ * Read operation filled a partial page.
*/
+ m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
- } else {
- /*
- * This page is being mapped, clear out any other
- * cruft in the invalid areas of the page.
- */
- if (m->valid && m->valid != VM_PAGE_BITS_ALL)
- vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
+ off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
+ struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
+ np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
+ offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@@ -253,6 +271,16 @@ nfs_putpages(ap)
}
/*
+ * When putting pages, do not extend the file past EOF.
+ */
+
+ if (offset + count > np->n_size) {
+ count = np->n_size - offset;
+ if (count < 0)
+ count = 0;
+ }
+
+ /*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
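
The EOF clamp added above keeps a putpages run from writing past the file size
the client believes in. For example, assuming np->n_size = 10000 and a
putpages call with offset = 8192 and count = 4096, count is chopped to
10000 - 8192 = 1808, so only the bytes that actually exist are pushed to the
server.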
@@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
+ uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
-nfs_bioread(vp, uio, ioflag, cred, getpages)
+nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
- int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
- off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
- int bufsize;
- int nra, error = 0, n = 0, on = 0, not_readin;
+ int bcount;
+ int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- not_readin = 1;
/*
* Start the read ahead(s), as required.
@@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
- * If the block is in the cache and has the required data
- * in a valid region, just copy it out.
- * Otherwise, get the block and write back/read in,
- * as required.
+ * Obtain the buffer cache block. Figure out the buffer size
+ * when we are at EOF. nfs_getcacheblk() will also force
+ * uncached delayed-writes to be flushed to the server.
+ *
+ * Note that bcount is *not* DEV_BSIZE aligned.
*/
-again:
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size &&
- (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ bcount = biosize;
+ if ((off_t)lbn * biosize >= np->n_size) {
+ bcount = 0;
+ } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
+ bcount = np->n_size - (off_t)lbn * biosize;
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
- * If we are being called from nfs_getpages, we must
- * make sure the buffer is a vmio buffer. The vp will
- * already be setup for vmio but there may be some old
- * non-vmio buffers attached to it.
+ * If B_CACHE is not set, we must issue the read. If this
+ * fails, we return an error.
*/
- if (getpages && !(bp->b_flags & B_VMIO)) {
-#ifdef DIAGNOSTIC
- printf("nfs_bioread: non vmio buf found, discarding\n");
-#endif
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
+
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
- not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -501,32 +509,20 @@ again:
return (error);
}
}
- if (bufsize > on) {
- n = min((unsigned)(bufsize - on), uio->uio_resid);
- } else {
- n = 0;
- }
- diff = np->n_size - uio->uio_offset;
- if (diff < n)
- n = diff;
- if (not_readin && n > 0) {
- if (on < bp->b_validoff || (on + n) > bp->b_validend) {
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
- }
+
+ /*
+ * on is the offset into the current bp. Figure out how many
+ * bytes we can copy out of the bp. Note that bcount is
+ * NOT DEV_BSIZE aligned.
+ *
+ * Then figure out how many bytes we can copy into the uio.
+ */
+
+ n = 0;
+ if (on < bcount)
+ n = min((unsigned)(bcount - on), uio->uio_resid);
+
vp->v_lastr = lbn;
- diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
- if (diff < n)
- n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
@@ -535,7 +531,6 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,13 +555,13 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
+ printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@@ -574,6 +569,10 @@ again:
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
+ *
+ * Leave the last bp intact unless there is an error.
+ * Loop back up to the while if the error is another
+ * NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@@ -582,21 +581,32 @@ again:
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
- if ((bp->b_flags & B_DONE) == 0) {
- bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
- vfs_busy_pages(bp, 0);
- error = nfs_doio(bp, cred, p);
- if (error == 0 && (bp->b_flags & B_INVAL))
- break;
- if (error) {
- brelse(bp);
- } else if (i < lbn) {
- brelse(bp);
- }
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ /*
+ * no error + B_INVAL == directory EOF,
+ * use the block.
+ */
+ if (error == 0 && (bp->b_flags & B_INVAL))
+ break;
}
+ /*
+ * An error will throw away the block and the
+ * for loop will break out. If no error and this
+ * is not the block we want, we throw away the
+ * block and go for the next one via the for loop.
+ */
+ if (error || i < lbn)
+ brelse(bp);
}
}
+ /*
+ * The above while is repeated if we hit another cookie
+ * error. If we hit an error and it wasn't a cookie error,
+ * we give up.
+ */
if (error)
return (error);
}
@@ -616,7 +626,6 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -629,10 +638,20 @@ again:
}
}
/*
- * Make sure we use a signed variant of min() since
- * the second term may be negative.
+ * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
+ * chopped for the EOF condition, we cannot tell how large
+ * NFS directories are going to be until we hit EOF. So
+ * an NFS directory buffer is *not* chopped to its EOF. Now,
+ * it just so happens that b_resid will effectively chop it
+ * to EOF. *BUT* this information is lost if the buffer goes
+ * away and is reconstituted into a B_CACHE state ( due to
+ * being VMIO ) later. So we keep track of the directory eof
+ * in np->n_direofoffset and chop it off as an extra step
+ * right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
+ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
+ n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@@ -649,6 +668,10 @@ again:
n = 0;
break;
case VDIR:
+ /*
+ * Invalidate buffer if caching is disabled, forcing a
+ * re-read from the remote later.
+ */
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@@ -660,24 +683,6 @@ again:
return (error);
}
-static void
-nfs_prot_buf(bp, off, n)
- struct buf *bp;
- int off;
- int n;
-{
- int pindex, boff, end;
-
- if ((bp->b_flags & B_VMIO) == 0)
- return;
-
- end = round_page(off + n);
- for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
- pindex = boff >> PAGE_SHIFT;
- vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
- }
-}
-
/*
* Vnode op for write using bio
*/
@@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
- register int biosize;
- register struct uio *uio = ap->a_uio;
+ int biosize;
+ struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
- register struct ucred *cred = ap->a_cred;
+ struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
- int bufsize;
+ int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
- /*
- * I use nm_rsize, not nm_wsize so that all buffer cache blocks
- * will be the same size within a filesystem. nfs_writerpc will
- * still use nm_wsize when sizing the rpc's.
- */
+
biosize = vp->v_mount->mnt_stat.f_iosize;
+
do {
/*
* Check for a valid write lease.
@@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
- if (uio->uio_offset + n > np->n_size) {
+ /*
+ * Handle direct append and file extension cases, calculate
+ * unaligned buffer size.
+ */
+
+ if (uio->uio_offset == np->n_size && n) {
+ /*
+ * special append case. Obtain buffer prior to
+ * resizing it to maintain B_CACHE.
+ */
+ long save;
+
+ bcount = on;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ save = bp->b_flags & B_CACHE;
+
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
+
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ } else {
+ if (uio->uio_offset + n > np->n_size) {
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
+ }
+ bcount = biosize;
+ if ((off_t)(lbn + 1) * biosize > np->n_size)
+ bcount = np->n_size - (off_t)lbn * biosize;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ }
+
+ /*
+ * Issue a READ if B_CACHE is not set. In special-append
+ * mode, B_CACHE is based on the buffer prior to the write
+ * op and is typically set, avoiding the read. If a read
+ * is required in special append mode, the server will
+ * probably send us a short-read since we extended the file
+ * on our end, resulting in b_resid == 0 and, thusly,
+ * B_CACHE getting set.
+ *
+ * We can also avoid issuing the read if the write covers
+ * the entire buffer. We have to make sure the buffer state
+ * is reasonable in this case since we will not be initiating
+ * I/O. See the comments in kern/vfs_bio.c's getblk() for
+ * more information.
+ *
+ * B_CACHE may also be set due to the buffer being cached
+ * normally.
+ */
+
+ if (on == 0 && n == bcount) {
+ bp->b_flags |= B_CACHE;
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
}
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ if (error) {
+ brelse(bp);
+ return (error);
+ }
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
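The read-avoidance rule described in the comment above reduces to a small decision: a write that covers the whole buffer marks it B_CACHE outright, and only a buffer still lacking B_CACHE needs a read. A condensed userland sketch, with mock flag values and the nfs_doio() call stubbed out:

#include <stdio.h>

#define B_CACHE 0x1
#define B_READ  0x2

static int
prepare_write_buf(int *flags, int on, int n, int bcount)
{
        /*
         * A write covering the whole buffer needs no read-before-write:
         * every byte will be overwritten, so mark the buffer fully valid.
         */
        if (on == 0 && n == bcount)
                *flags |= B_CACHE;

        /* Otherwise a partially valid buffer must be filled by a READ. */
        if ((*flags & B_CACHE) == 0) {
                *flags |= B_READ;
                return (1);     /* the caller would run nfs_doio() here */
        }
        return (0);
}

int
main(void)
{
        int flags = 0;

        printf("%d\n", prepare_write_buf(&flags, 0, 8192, 8192));  /* 0 */
        flags = 0;
        printf("%d\n", prepare_write_buf(&flags, 100, 200, 8192)); /* 1 */
        return (0);
}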
@@ -820,6 +879,17 @@ again:
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
+ *
+ * While it is possible to merge discontiguous writes due to
+ * our having a B_CACHE buffer ( and thus valid read data
+ * for the hole), we don't because it could lead to
+ * significant cache coherency problems with multiple clients,
+ * especially if locking is implemented later on.
+ *
+ * As an optimization we could theoretically maintain
+ * a linked list of discontiguous areas, but we would still
+ * have to commit them separately so there isn't much
+ * advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
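The contiguity rule the comment describes can be shown in isolation. This sketch keeps a single dirty range and reports when a new write at [on, on+n) would be discontiguous with it; struct dirty is a toy stand-in for the b_dirtyoff/b_dirtyend pair, and the flush-and-retry the kernel performs is left out:

#include <stdio.h>

struct dirty { int off, end; };

/* returns 1 if the old dirty range must be written out first */
static int
merge_dirty(struct dirty *d, int on, int n)
{
        if (d->end > 0 && (on > d->end || on + n < d->off))
                return (1);             /* discontiguous: flush old range */
        if (d->end == 0) {
                d->off = on;            /* no dirty range yet: start one */
                d->end = on + n;
        } else {
                if (on < d->off)        /* contiguous/overlapping: extend */
                        d->off = on;
                if (on + n > d->end)
                        d->end = on + n;
        }
        return (0);
}

int
main(void)
{
        struct dirty d = { 0, 0 };

        merge_dirty(&d, 100, 50);                 /* starts [100,150) */
        printf("%d\n", merge_dirty(&d, 150, 10)); /* 0: grows to [100,160) */
        printf("%d\n", merge_dirty(&d, 500, 10)); /* 1: gap, must flush */
        return (0);
}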
@@ -863,11 +933,6 @@ again:
}
/*
- * This will keep the buffer and mmaped regions more coherent.
- */
- nfs_prot_buf(bp, on, n);
-
- /*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
*/
@@ -879,21 +944,7 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
- }
-
- /*
- * To avoid code complexity, we may have to throw away
- * previously valid ranges when merging the new dirty range
- * into the valid range. As long as we do not *ADD* an
- * invalid valid range, we are ok.
- */
- if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
- bp->b_validoff > bp->b_dirtyend) {
- bp->b_validoff = bp->b_dirtyoff;
- bp->b_validend = bp->b_dirtyend;
- } else {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
+ vfs_bio_set_validclean(bp, on, n);
}
/*
@@ -904,11 +955,14 @@ again:
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
+ *
+ * IO_INVAL appears to be unused. The intent seems to be
+ * to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
- bp->b_flags |= B_INVAL;
+ bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@@ -922,8 +976,9 @@ again:
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
- } else
+ } else {
bdwrite(bp);
+ }
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
@@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
- } else
+ } else {
bp = getblk(vp, bn, size, 0, 0);
+ }
if (vp->v_type == VREG) {
int biosize;
+
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
-
return (bp);
}
@@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
+ *
+ * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
+ * is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
- int error = 0, diff, len, iomode, must_commit = 0;
+ int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ /*
+ * Clear B_ERROR and B_INVAL state prior to initiating the I/O. We
+ * do this here so we do not have to do it in all the code that
+ * calls us.
+ */
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
+
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
- bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
- * If len > 0, there is a hole in the file and
- * no writes after the hole have been pushed to
- * the server yet.
- * Just zero fill the rest of the valid area.
+ * If we had a short read with no error, we must have
+ * hit a file hole. We should zero-fill the remainder.
+ * This can also occur if the server hits the file EOF.
+ *
+ * Holes used to be able to occur due to pending
+ * writes, but that is not possible any longer.
*/
- diff = bp->b_bcount - uiop->uio_resid;
- len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
- + diff);
- if (len > 0) {
- len = min(len, uiop->uio_resid);
- bzero((char *)bp->b_data + diff, len);
- bp->b_validend = diff + len;
- } else
- bp->b_validend = diff;
- } else
- bp->b_validend = bp->b_bcount;
+ int nread = bp->b_bcount - uiop->uio_resid;
+ int left = bp->b_bcount - nread;
+
+ if (left > 0)
+ bzero((char *)bp->b_data + nread, left);
+ uiop->uio_resid = 0;
+ }
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
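The short-read handling above amounts to zero-filling the tail of the buffer. A self-contained version with bzero() swapped for memset() and the buffer mocked:

#include <stdio.h>
#include <string.h>

static void
finish_read(char *data, int bcount, int resid)
{
        int nread = bcount - resid;   /* bytes the server actually returned */
        int left = bcount - nread;    /* tail the hole/EOF left unfilled */

        if (left > 0)
                memset(data + nread, 0, left);
}

int
main(void)
{
        char buf[16];

        memset(buf, 0xff, sizeof(buf));
        finish_read(buf, sizeof(buf), 6);       /* 10 bytes read, 6 short */
        printf("%d\n", buf[15]);                /* prints 0 */
        return (0);
}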
@@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
+ /*
+ * End-of-directory sets B_INVAL but does not generate an
+ * error.
+ */
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
- && bp->b_dirtyend == bp->b_bufsize)
+ && bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
diff --git a/sys/nfs/nfs_nqlease.c b/sys/nfs/nfs_nqlease.c
index 71f692a..e45c73f 100644
--- a/sys/nfs/nfs_nqlease.c
+++ b/sys/nfs/nfs_nqlease.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_nqlease.c 8.9 (Berkeley) 5/20/95
- * $Id: nfs_nqlease.c,v 1.39 1998/10/31 15:31:25 peter Exp $
+ * $Id: nfs_nqlease.c,v 1.40 1999/02/25 00:03:51 peter Exp $
*/
@@ -561,6 +561,10 @@ nqsrv_send_eviction(vp, lp, slp, nam, cred)
*mtod(m, u_int32_t *) = htonl(0x80000000 |
(m->m_pkthdr.len - NFSX_UNSIGNED));
}
+ /*
+ * nfs_sndlock if PR_CONNREQUIRED XXX
+ */
+
if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 &&
(lph->lph_slp->ns_flag & SLP_VALID) == 0) ||
(nfs_slplock(lph->lph_slp, 0) == 0))
diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c
index 1490f72..2267629 100644
--- a/sys/nfs/nfs_socket.c
+++ b/sys/nfs/nfs_socket.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to occur only occasionally
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
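For reference, the new strategy can be demonstrated outside the kernel. The sketch below uses a toy segment chain in place of mbufs and a single malloc'd buffer in place of the MGET/MCLGET plus m_copyback sequence; the alignment test is the same (len & 0x3) || (ptr & 0x3) check used above. Allocation error handling is omitted:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct seg {
        struct seg *next;
        char *data;
        int len;
};

static void
realign(struct seg **pm)
{
        struct seg *m, *s, *t, *n;
        int off, total = 0;

        /* walk to the first misaligned segment, as nfs_realign() does */
        while ((m = *pm) != NULL) {
                if ((m->len & 0x3) || ((uintptr_t)m->data & 0x3))
                        break;
                pm = &m->next;
        }
        if (m == NULL)
                return;

        for (s = m; s != NULL; s = s->next)
                total += s->len;

        /* one aligned buffer replaces the rest of the chain */
        n = malloc(sizeof(*n));
        n->next = NULL;
        n->len = total;
        n->data = malloc(total ? total : 1);    /* malloc memory is aligned */

        for (s = m, off = 0; s != NULL; s = t) {
                t = s->next;
                memcpy(n->data + off, s->data, s->len);
                off += s->len;
                free(s->data);
                free(s);
        }
        *pm = n;        /* splice: everything from the bad segment on */
}

int
main(void)
{
        struct seg *s0 = malloc(sizeof(*s0));
        struct seg *s1 = malloc(sizeof(*s1));
        struct seg *head = s0;

        s0->next = s1; s0->len = 8; s0->data = malloc(8);
        s1->next = NULL; s1->len = 5; s1->data = malloc(5); /* odd length */
        memset(s0->data, 1, 8);
        memset(s1->data, 2, 5);

        realign(&head);
        printf("%d\n", head->next->len);        /* prints 5: s1 recopied */

        free(head->next->data); free(head->next);
        free(head->data); free(head);
        return (0);
}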
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index a92bb22..6114d56 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
- bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
+ bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
- aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
+ aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
- return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
- return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
- error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
+ error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
- register struct dirent *dp;
+ register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
-#ifndef nolint
- dp = (struct dirent *)0;
-#endif
#ifndef DIAGNOSTIC
- if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
- (uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
+ if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
+ (uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
- * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
+ * Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
-
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.
+ * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
+ * B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
- if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_NOCACHE;
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return(0);
+ }
+
+ bp->b_flags |= B_CACHE;
/*
- * XXX we bundirty() the bp here. Shouldn't we do it later after
- * the I/O has completed??
+ * Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
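The behavioral change at the top of nfs_writebp() is small but worth isolating: a buffer already marked B_INVAL is now released rather than written out with B_NOCACHE. A mock-flag sketch of the new entry logic, with brelse() stubbed:

#include <stdio.h>

#define B_INVAL 0x1
#define B_CACHE 0x2

static int released;

static void brelse_stub(int *flags) { (void)flags; released = 1; }

static int
writebp(int *flags)
{
        if (*flags & B_INVAL) {         /* stale data: do not write it */
                brelse_stub(flags);
                return (0);
        }
        *flags |= B_CACHE;              /* a written buffer is fully valid */
        /* ... undirty the buffer and issue the write here ... */
        return (0);
}

int
main(void)
{
        int flags = B_INVAL;

        writebp(&flags);
        printf("%d\n", released);       /* prints 1 */
        return (0);
}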
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index cef982b..0d8a782 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
+ * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
-static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
- int i, error, nextoff, size, toff, npages, count;
+ int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ npages = btoc(count);
+
+ /*
+ * If the requested page is partially valid, just return it and
+ * allow the pager to zero-out the blanks. Partially valid pages
+ * can only occur at the file EOF.
+ */
+
+ {
+ vm_page_t m = pages[ap->a_reqpage];
+
+ if (m->valid != 0) {
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
+ return(0);
+ }
+ }
+
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
- npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
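The fast path added above never touches the network: if the requested page has any valid bits, it is returned as-is and the speculative read-ahead pages are freed. A toy version with integer valid-bit masks standing in for vm_page_t and a stub for vnode_pager_freepage():

#include <stdio.h>

#define NPAGES 4

static int freed[NPAGES];

static void freepage_stub(int i) { freed[i] = 1; }

/* returns 1 if the read was skipped */
static int
getpages_fastpath(int valid[], int npages, int reqpage)
{
        int i;

        if (valid[reqpage] != 0) {
                for (i = 0; i < npages; ++i)
                        if (i != reqpage)
                                freepage_stub(i);
                return (1);
        }
        return (0);
}

int
main(void)
{
        int valid[NPAGES] = { 0, 0x0f, 0, 0 }; /* page 1 partially valid */

        printf("%d\n", getpages_fastpath(valid, NPAGES, 1)); /* prints 1 */
        printf("%d\n", freed[0]);                            /* prints 1 */
        return (0);
}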
@@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
- * Read operation filled a partial page, set valid
- * bits properly. validclean will zero out
- * any cruft in the buffer when setting a valid bit,
- * if the size is not DEV_BSIZE aligned.
+ * Read operation filled a partial page.
*/
+ m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
- } else {
- /*
- * This page is being mapped, clear out any other
- * cruft in the invalid areas of the page.
- */
- if (m->valid && m->valid != VM_PAGE_BITS_ALL)
- vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
+ off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
+ struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
+ np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
+ offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@@ -253,6 +271,16 @@ nfs_putpages(ap)
}
/*
+ * When putting pages, do not extend file past EOF.
+ */
+
+ if (offset + count > np->n_size) {
+ count = np->n_size - offset;
+ if (count < 0)
+ count = 0;
+ }
+
+ /*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
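The EOF clamp above, extracted with example sizes; note that count can go negative before the clamp when every page in the run lies past EOF, which is why the second test is needed:

#include <stdio.h>
#include <sys/types.h>

static int
clamp_count(off_t offset, int count, off_t n_size)
{
        if (offset + count > n_size) {
                count = n_size - offset;
                if (count < 0)
                        count = 0;
        }
        return (count);
}

int
main(void)
{
        printf("%d\n", clamp_count(8192, 8192, 12000));  /* prints 3808 */
        printf("%d\n", clamp_count(16384, 4096, 12000)); /* prints 0 */
        return (0);
}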
@@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
+ uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
-nfs_bioread(vp, uio, ioflag, cred, getpages)
+nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
- int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
- off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
- int bufsize;
- int nra, error = 0, n = 0, on = 0, not_readin;
+ int bcount;
+ int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- not_readin = 1;
/*
* Start the read ahead(s), as required.
@@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
- * If the block is in the cache and has the required data
- * in a valid region, just copy it out.
- * Otherwise, get the block and write back/read in,
- * as required.
+ * Obtain the buffer cache block. Figure out the buffer size
+ * when we are at EOF. nfs_getcacheblk() will also force
+ * uncached delayed-writes to be flushed to the server.
+ *
+ * Note that bcount is *not* DEV_BSIZE aligned.
*/
-again:
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size &&
- (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ bcount = biosize;
+ if ((off_t)lbn * biosize >= np->n_size) {
+ bcount = 0;
+ } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
+ bcount = np->n_size - (off_t)lbn * biosize;
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
- * If we are being called from nfs_getpages, we must
- * make sure the buffer is a vmio buffer. The vp will
- * already be setup for vmio but there may be some old
- * non-vmio buffers attached to it.
+ * If B_CACHE is not set, we must issue the read. If this
+ * fails, we return an error.
*/
- if (getpages && !(bp->b_flags & B_VMIO)) {
-#ifdef DIAGNOSTIC
- printf("nfs_bioread: non vmio buf found, discarding\n");
-#endif
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
+
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
- not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -501,32 +509,20 @@ again:
return (error);
}
}
- if (bufsize > on) {
- n = min((unsigned)(bufsize - on), uio->uio_resid);
- } else {
- n = 0;
- }
- diff = np->n_size - uio->uio_offset;
- if (diff < n)
- n = diff;
- if (not_readin && n > 0) {
- if (on < bp->b_validoff || (on + n) > bp->b_validend) {
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
- }
+
+ /*
+ * on is the offset into the current bp. Figure out how many
+ * bytes we can copy out of the bp. Note that bcount is
+ * NOT DEV_BSIZE aligned.
+ *
+ * Then figure out how many bytes we can copy into the uio.
+ */
+
+ n = 0;
+ if (on < bcount)
+ n = min((unsigned)(bcount - on), uio->uio_resid);
+
vp->v_lastr = lbn;
- diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
- if (diff < n)
- n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
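Putting the two computations together, the EOF-chopped bcount from the earlier hunk and the copy length n above, with example numbers; biosize and the file size are made up, and biosize is assumed to be a power of two as f_iosize is in practice:

#include <stdio.h>
#include <sys/types.h>

static int
read_copy_len(off_t n_size, int biosize, off_t offset, int resid)
{
        off_t lbn = offset / biosize;
        int on = offset & (biosize - 1);
        int bcount = biosize;
        int n = 0;

        if ((off_t)lbn * biosize >= n_size)
                bcount = 0;             /* block is entirely past EOF */
        else if ((off_t)(lbn + 1) * biosize > n_size)
                bcount = n_size - (off_t)lbn * biosize;

        if (on < bcount) {
                n = bcount - on;
                if (resid < n)
                        n = resid;
        }
        return (n);
}

int
main(void)
{
        /* 20000-byte file, 8192-byte blocks, read at offset 17000 */
        printf("%d\n", read_copy_len(20000, 8192, 17000, 65536)); /* 3000 */
        return (0);
}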
@@ -535,7 +531,6 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,13 +555,13 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
+ printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@@ -574,6 +569,10 @@ again:
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
+ *
+ * Leave the last bp intact unless there is an error.
+ * Loop back up to the while if the error is another
+ * NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@@ -582,21 +581,32 @@ again:
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
- if ((bp->b_flags & B_DONE) == 0) {
- bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
- vfs_busy_pages(bp, 0);
- error = nfs_doio(bp, cred, p);
- if (error == 0 && (bp->b_flags & B_INVAL))
- break;
- if (error) {
- brelse(bp);
- } else if (i < lbn) {
- brelse(bp);
- }
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ /*
+ * no error + B_INVAL == directory EOF,
+ * use the block.
+ */
+ if (error == 0 && (bp->b_flags & B_INVAL))
+ break;
}
+ /*
+ * An error will throw away the block and the
+ * for loop will break out. If no error and this
+ * is not the block we want, we throw away the
+ * block and go for the next one via the for loop.
+ */
+ if (error || i < lbn)
+ brelse(bp);
}
}
+ /*
+ * The above while is repeated if we hit another cookie
+ * error. If we hit an error and it wasn't a cookie error,
+ * we give up.
+ */
if (error)
return (error);
}
@@ -616,7 +626,6 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -629,10 +638,20 @@ again:
}
}
/*
- * Make sure we use a signed variant of min() since
- * the second term may be negative.
+ * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
+ * chopped for the EOF condition, we cannot tell how large
+ * NFS directories are going to be until we hit EOF. So
+ * an NFS directory buffer is *not* chopped to its EOF. Now,
+ * it just so happens that b_resid will effectively chop it
+ * to EOF. *BUT* this information is lost if the buffer goes
+ * away and is reconstituted into a B_CACHE state ( due to
+ * being VMIO ) later. So we keep track of the directory eof
+ * in np->n_direofoffset and chop it off as an extra step
+ * right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
+ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
+ n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@@ -649,6 +668,10 @@ again:
n = 0;
break;
case VDIR:
+ /*
+ * Invalidate buffer if caching is disabled, forcing a
+ * re-read from the remote later.
+ */
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@@ -660,24 +683,6 @@ again:
return (error);
}
-static void
-nfs_prot_buf(bp, off, n)
- struct buf *bp;
- int off;
- int n;
-{
- int pindex, boff, end;
-
- if ((bp->b_flags & B_VMIO) == 0)
- return;
-
- end = round_page(off + n);
- for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
- pindex = boff >> PAGE_SHIFT;
- vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
- }
-}
-
/*
* Vnode op for write using bio
*/
@@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
- register int biosize;
- register struct uio *uio = ap->a_uio;
+ int biosize;
+ struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
- register struct ucred *cred = ap->a_cred;
+ struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
- int bufsize;
+ int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
- /*
- * I use nm_rsize, not nm_wsize so that all buffer cache blocks
- * will be the same size within a filesystem. nfs_writerpc will
- * still use nm_wsize when sizing the rpc's.
- */
+
biosize = vp->v_mount->mnt_stat.f_iosize;
+
do {
/*
* Check for a valid write lease.
@@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
- if (uio->uio_offset + n > np->n_size) {
+ /*
+ * Handle direct append and file extension cases, calculate
+ * unaligned buffer size.
+ */
+
+ if (uio->uio_offset == np->n_size && n) {
+ /*
+ * Special append case. Obtain the buffer prior to
+ * resizing it to maintain B_CACHE.
+ */
+ long save;
+
+ bcount = on;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ save = bp->b_flags & B_CACHE;
+
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
+
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ } else {
+ if (uio->uio_offset + n > np->n_size) {
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
+ }
+ bcount = biosize;
+ if ((off_t)(lbn + 1) * biosize > np->n_size)
+ bcount = np->n_size - (off_t)lbn * biosize;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ }
+
+ /*
+ * Issue a READ if B_CACHE is not set. In special-append
+ * mode, B_CACHE is based on the buffer prior to the write
+ * op and is typically set, avoiding the read. If a read
+ * is required in special append mode, the server will
+ * probably send us a short-read since we extended the file
+ * on our end, resulting in b_resid == 0 and, thus,
+ * B_CACHE getting set.
+ *
+ * We can also avoid issuing the read if the write covers
+ * the entire buffer. We have to make sure the buffer state
+ * is reasonable in this case since we will not be initiating
+ * I/O. See the comments in kern/vfs_bio.c's getblk() for
+ * more information.
+ *
+ * B_CACHE may also be set due to the buffer being cached
+ * normally.
+ */
+
+ if (on == 0 && n == bcount) {
+ bp->b_flags |= B_CACHE;
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
}
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ if (error) {
+ brelse(bp);
+ return (error);
+ }
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
@@ -820,6 +879,17 @@ again:
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
+ *
+ * While it is possible to merge discontiguous writes due to
+ * our having a B_CACHE buffer ( and thus valid read data
+ * for the hole), we don't because it could lead to
+ * significant cache coherency problems with multiple clients,
+ * especially if locking is implemented later on.
+ *
+ * As an optimization we could theoretically maintain
+ * a linked list of discontiguous areas, but we would still
+ * have to commit them separately so there isn't much
+ * advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
@@ -863,11 +933,6 @@ again:
}
/*
- * This will keep the buffer and mmaped regions more coherent.
- */
- nfs_prot_buf(bp, on, n);
-
- /*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
*/
@@ -879,21 +944,7 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
- }
-
- /*
- * To avoid code complexity, we may have to throw away
- * previously valid ranges when merging the new dirty range
- * into the valid range. As long as we do not *ADD* an
- * invalid valid range, we are ok.
- */
- if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
- bp->b_validoff > bp->b_dirtyend) {
- bp->b_validoff = bp->b_dirtyoff;
- bp->b_validend = bp->b_dirtyend;
- } else {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
+ vfs_bio_set_validclean(bp, on, n);
}
/*
@@ -904,11 +955,14 @@ again:
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
+ *
+ * IO_INVAL appears to be unused. The intent seems to be
+ * to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
- bp->b_flags |= B_INVAL;
+ bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@@ -922,8 +976,9 @@ again:
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
- } else
+ } else {
bdwrite(bp);
+ }
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
@@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
- } else
+ } else {
bp = getblk(vp, bn, size, 0, 0);
+ }
if (vp->v_type == VREG) {
int biosize;
+
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
-
return (bp);
}
@@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
+ *
+ * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
+ * is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
- int error = 0, diff, len, iomode, must_commit = 0;
+ int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ /*
+ * Clear B_ERROR and B_INVAL state prior to initiating the I/O. We
+ * do this here so we do not have to do it in all the code that
+ * calls us.
+ */
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
+
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
- bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
- * If len > 0, there is a hole in the file and
- * no writes after the hole have been pushed to
- * the server yet.
- * Just zero fill the rest of the valid area.
+ * If we had a short read with no error, we must have
+ * hit a file hole. We should zero-fill the remainder.
+ * This can also occur if the server hits the file EOF.
+ *
+ * Holes used to be able to occur due to pending
+ * writes, but that is not possible any longer.
*/
- diff = bp->b_bcount - uiop->uio_resid;
- len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
- + diff);
- if (len > 0) {
- len = min(len, uiop->uio_resid);
- bzero((char *)bp->b_data + diff, len);
- bp->b_validend = diff + len;
- } else
- bp->b_validend = diff;
- } else
- bp->b_validend = bp->b_bcount;
+ int nread = bp->b_bcount - uiop->uio_resid;
+ int left = bp->b_bcount - nread;
+
+ if (left > 0)
+ bzero((char *)bp->b_data + nread, left);
+ uiop->uio_resid = 0;
+ }
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
@@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
+ /*
+ * End-of-directory sets B_INVAL but does not generate an
+ * error.
+ */
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
- && bp->b_dirtyend == bp->b_bufsize)
+ && bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c
index 1490f72..2267629 100644
--- a/sys/nfsclient/nfs_socket.c
+++ b/sys/nfsclient/nfs_socket.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to occur only occasionally
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index a92bb22..6114d56 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
- bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
+ bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
- aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
+ aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
- return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
- return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
- error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
+ error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
- register struct dirent *dp;
+ register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
-#ifndef nolint
- dp = (struct dirent *)0;
-#endif
#ifndef DIAGNOSTIC
- if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
- (uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
+ if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
+ (uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
- * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
+ * Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
-
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.
+ * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
+ * B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
- if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_NOCACHE;
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return(0);
+ }
+
+ bp->b_flags |= B_CACHE;
/*
- * XXX we bundirty() the bp here. Shouldn't we do it later after
- * the I/O has completed??
+ * Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
diff --git a/sys/nfsclient/nfsargs.h b/sys/nfsclient/nfsargs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsclient/nfsargs.h
+++ b/sys/nfsclient/nfsargs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsclient/nfsstats.h b/sys/nfsclient/nfsstats.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsclient/nfsstats.h
+++ b/sys/nfsclient/nfsstats.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsserver/nfs.h b/sys/nfsserver/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsserver/nfs.h
+++ b/sys/nfsserver/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c
index 1490f72..2267629 100644
--- a/sys/nfsserver/nfs_srvsock.c
+++ b/sys/nfsserver/nfs_srvsock.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to occur only occasionally
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfsserver/nfsrvstats.h b/sys/nfsserver/nfsrvstats.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsserver/nfsrvstats.h
+++ b/sys/nfsserver/nfsrvstats.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index d2ce212..2e88ca7 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
+ * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
+ *
+ * NOTES:
+ * b_bufsize, b_bcount. b_bufsize is the allocation size of the
+ * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
+ * originally requested buffer size and can serve as a bounds check
+ * against EOF. For most, but not all uses, b_bcount == b_bufsize.
+ *
+ * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
+ * ranges of dirty data that need to be written to backing store.
+ * The range is typically clipped at b_bcount ( not b_bufsize ).
+ *
+ * b_resid. Number of bytes remaining in I/O. After an I/O operation
+ * completes, b_resid is usually 0 indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
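A one-line illustration of the b_bcount / b_bufsize relationship described in the note above, assuming the usual DEV_BSIZE of 512; an EOF-chopped request of 3616 bytes rounds up to a 4096-byte allocation:

#include <stdio.h>

#define DEV_BSIZE 512   /* the usual value; shown here only for the math */

int
main(void)
{
        int b_bcount = 3616;            /* EOF-chopped request */
        int b_bufsize = (b_bcount + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

        printf("%d %d\n", b_bcount, b_bufsize); /* prints 3616 4096 */
        return (0);
}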
@@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
+#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
+#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
- * The situation where B_DELWRI is set and B_CACHE gets
- * cleared MUST be committed to disk so B_DELWRI can
- * also be cleared.
+ * The situation where B_DELWRI is set and B_CACHE is
+ * clear MUST be committed to disk by getblk() so
+ * B_DELWRI can also be cleared. See the comments for
+ * getblk() in kern/vfs_bio.c. If B_CACHE is clear,
+ * the caller is expected to clear B_ERROR|B_INVAL,
+ * set B_READ, and initiate an I/O.
+ *
+ * The 'entire buffer' is defined to be the range from
+ * 0 through b_bcount.
+ *
+ * B_MALLOC Request that the buffer be allocated from the malloc
+ * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
+ *
+ * B_VMIO Indicates that the buffer is tied into a VM object.
+ * The buffer's data is always PAGE_SIZE aligned even
+ * if b_bufsize and b_bcount are not. ( b_bufsize is
+ * always at least DEV_BSIZE aligned, though ).
+ *
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
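The caller protocol spelled out for B_CACHE above is the pattern this change installs throughout the tree (see nfs_bioread() and nfs_write() earlier in this diff). A stubbed sketch of that sequence, with mock flag values and the actual I/O replaced by a counter:

#include <stdio.h>

#define B_CACHE 0x01
#define B_READ  0x02
#define B_ERROR 0x04
#define B_INVAL 0x08

static int io_issued;

static int doio_stub(int *flags) { (void)flags; io_issued = 1; return (0); }

static int
get_valid_buffer(int *flags)
{
        if ((*flags & B_CACHE) == 0) {
                *flags &= ~(B_ERROR | B_INVAL); /* clear stale state */
                *flags |= B_READ;               /* then initiate the read */
                return (doio_stub(flags));
        }
        return (0);             /* cache hit: data is already valid */
}

int
main(void)
{
        int flags = 0;

        get_valid_buffer(&flags);
        printf("%d\n", io_issued);      /* prints 1 */
        return (0);
}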
@@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
+void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
+
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index d2ce212..2e88ca7 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
+ * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
+ *
+ * NOTES:
+ * b_bufsize, b_bcount. b_bufsize is the allocation size of the
+ * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
+ * originally requested buffer size and can serve as a bounds check
+ * against EOF. For most, but not all uses, b_bcount == b_bufsize.
+ *
+ * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
+ * ranges of dirty data that need to be written to backing store.
+ * The range is typically clipped at b_bcount ( not b_bufsize ).
+ *
+ * b_resid. Number of bytes remaining in I/O. After an I/O operation
+ * completes, b_resid is usually 0, indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
@@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
+#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
+#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
- * The situation where B_DELWRI is set and B_CACHE gets
- * cleared MUST be committed to disk so B_DELWRI can
- * also be cleared.
+ * The situation where B_DELWRI is set and B_CACHE is
+ * clear MUST be committed to disk by getblk() so
+ * B_DELWRI can also be cleared. See the comments for
+ * getblk() in kern/vfs_bio.c. If B_CACHE is clear,
+ * the caller is expected to clear B_ERROR|B_INVAL,
+ * set B_READ, and initiate an I/O.
+ *
+ * The 'entire buffer' is defined to be the range from
+ * 0 through b_bcount.
+ *
+ * B_MALLOC Request that the buffer be allocated from the malloc
+ * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
+ *
+ * B_VMIO Indicates that the buffer is tied into a VM object.
+ * The buffer's data is always PAGE_SIZE aligned even
+ * if b_bufsize and b_bcount are not. ( b_bufsize is
+ * always at least DEV_BSIZE aligned, though ).
+ *
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
@@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
+void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
+
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
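The b_dirtyoff/b_dirtyend note above says the dirty range is clipped at b_bcount rather than b_bufsize. A sketch of how a writer might grow the dirty range under that rule; buf_dirty_range() is a hypothetical helper, and treating b_dirtyend == 0 as "no dirty region yet" is an assumption of the sketch:

static void
buf_dirty_range(struct buf *bp, int off, int len)
{
        int end = off + len;

        /* Clip at the originally requested size, not the allocation. */
        if (end > bp->b_bcount)
                end = bp->b_bcount;
        if (bp->b_dirtyend == 0) {
                /* no dirty region yet */
                bp->b_dirtyoff = off;
                bp->b_dirtyend = end;
        } else {
                /* extend the existing dirty region to cover the range */
                if (off < bp->b_dirtyoff)
                        bp->b_dirtyoff = off;
                if (end > bp->b_dirtyend)
                        bp->b_dirtyend = end;
        }
}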
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 8821440..c80d0a5 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
- * $Id: ffs_inode.c,v 1.52 1999/01/07 16:14:16 bde Exp $
+ * $Id: ffs_inode.c,v 1.53 1999/01/28 00:57:54 dillon Exp $
*/
#include "opt_quota.h"
@@ -452,6 +452,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
if ((bp->b_flags & B_CACHE) == 0) {
curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_ERROR|B_INVAL);
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c
index d4d82f0..c9ae4dd 100644
--- a/sys/ufs/mfs/mfs_vnops.c
+++ b/sys/ufs/mfs/mfs_vnops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
- * $Id: mfs_vnops.c,v 1.42 1999/01/28 00:57:55 dillon Exp $
+ * $Id: mfs_vnops.c,v 1.43 1999/04/11 02:28:32 eivind Exp $
*/
#include <sys/param.h>
@@ -127,6 +127,9 @@ mfs_fsync(ap)
* We implement the B_FREEBUF strategy. We can't just madvise()
* here because we have to do it in the correct order vs other bio
* requests, so we queue it.
+ *
+ * Note: geteblk() sets B_INVAL. We leave it set to guarantee buffer
+ * throw-away on brelse()? XXX
*/
static int
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 047f10f..882d52e 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -66,7 +66,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_fault.c,v 1.100 1999/02/17 09:08:29 dillon Exp $
+ * $Id: vm_fault.c,v 1.101 1999/02/25 06:00:52 alc Exp $
*/
/*
@@ -409,6 +409,12 @@ readrest:
firstpindex = fs.first_pindex -
2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1);
+ /*
+ * note: partially valid pages cannot be
+ * included in the lookahead - NFS piecemeal
+ * writes will barf on it badly.
+ */
+
for(tmppindex = fs.first_pindex - 1;
tmppindex >= firstpindex;
--tmppindex) {
@@ -552,12 +558,16 @@ readrest:
}
fs.first_m = NULL;
+ /*
+ * Zero the page if necessary and mark it valid.
+ */
if ((fs.m->flags & PG_ZERO) == 0) {
vm_page_zero_fill(fs.m);
- }
- else
+ } else {
cnt.v_ozfod++;
+ }
cnt.v_zfod++;
+ fs.m->valid = VM_PAGE_BITS_ALL;
break; /* break to PAGE HAS BEEN FOUND */
} else {
if (fs.object != fs.first_object) {
@@ -788,14 +798,24 @@ readrest:
#endif
unlock_things(&fs);
- fs.m->valid = VM_PAGE_BITS_ALL;
- vm_page_flag_clear(fs.m, PG_ZERO);
+
+ /*
+ * Sanity check: page must be completely valid or it is not fit to
+ * map into user space. vm_pager_get_pages() ensures this.
+ */
+
+ if (fs.m->valid != VM_PAGE_BITS_ALL) {
+ vm_page_zero_invalid(fs.m, TRUE);
+ printf("Warning: page %p partially invalid on fault\n", fs.m);
+ }
pmap_enter(fs.map->pmap, vaddr, VM_PAGE_TO_PHYS(fs.m), prot, wired);
+
if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) {
pmap_prefault(fs.map->pmap, vaddr, fs.entry);
}
+ vm_page_flag_clear(fs.m, PG_ZERO);
vm_page_flag_set(fs.m, PG_MAPPED|PG_REFERENCED);
if (fault_flags & VM_FAULT_HOLD)
vm_page_hold(fs.m);
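The fallback above depends on vm_page_zero_invalid() zeroing any DEV_BSIZE chunk whose valid bit is clear before the page is mapped into user space. A simplified sketch of that behavior under stated assumptions (the caller supplies the page's kernel mapping va; the real routine lives in vm/vm_page.c and the names here are illustrative):

static void
zero_invalid_chunks(vm_page_t m, caddr_t va, boolean_t setvalid)
{
        int i;

        for (i = 0; i < PAGE_SIZE / DEV_BSIZE; ++i) {
                /* zero each DEV_BSIZE chunk that is not marked valid */
                if ((m->valid & (1 << i)) == 0)
                        bzero(va + (i << DEV_BSHIFT), DEV_BSIZE);
        }
        if (setvalid)
                m->valid = VM_PAGE_BITS_ALL;
}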
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index e07ea63..0d85a94 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
- * $Id: vm_page.c,v 1.128 1999/03/19 05:21:03 alc Exp $
+ * $Id: vm_page.c,v 1.129 1999/04/05 19:38:29 julian Exp $
*/
/*
@@ -1460,14 +1460,16 @@ vm_page_bits(int base, int size)
}
/*
- * set a page valid and clean. May not block.
+ * vm_page_set_validclean:
*
- * In order to maintain consistancy due to the DEV_BSIZE granularity
- * of the valid bits, we have to zero non-DEV_BSIZE aligned portions of
- * the page at the beginning and end of the valid range when the
- * associated valid bits are not already set.
+ * Sets portions of a page valid and clean. The arguments are expected
+ * to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
+ * of any partial chunks touched by the range. The invalid portion of
+ * such chunks will be zero'd.
*
- * (base + size) must be less then or equal to PAGE_SIZE.
+ * This routine may not block.
+ *
+ * (base + size) must be less than or equal to PAGE_SIZE.
*/
void
vm_page_set_validclean(m, base, size)
@@ -1529,8 +1531,35 @@ vm_page_set_validclean(m, base, size)
pmap_clear_modify(VM_PAGE_TO_PHYS(m));
}
+#if 0
+
+void
+vm_page_set_dirty(m, base, size)
+ vm_page_t m;
+ int base;
+ int size;
+{
+ m->dirty |= vm_page_bits(base, size);
+}
+
+#endif
+
+void
+vm_page_clear_dirty(m, base, size)
+ vm_page_t m;
+ int base;
+ int size;
+{
+ m->dirty &= ~vm_page_bits(base, size);
+}
+
/*
- * set a page (partially) invalid. May not block.
+ * vm_page_set_invalid:
+ *
+ * Invalidates DEV_BSIZE'd chunks within a page. Both the
+ * valid and dirty bits for the affected areas are cleared.
+ *
+ * May not block.
*/
void
vm_page_set_invalid(m, base, size)
@@ -1540,9 +1569,9 @@ vm_page_set_invalid(m, base, size)
{
int bits;
- m->valid &= ~(bits = vm_page_bits(base, size));
- if (m->valid == 0)
- m->dirty &= ~bits;
+ bits = vm_page_bits(base, size);
+ m->valid &= ~bits;
+ m->dirty &= ~bits;
m->object->generation++;
}
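Both vm_page_set_validclean() and the new vm_page_set_invalid() body above are driven by vm_page_bits(), which turns a byte range into a mask with one bit per DEV_BSIZE chunk. An equivalent computation, sketched for illustration (the real inline appears earlier in this file):

static __inline int
page_bits(int base, int size)
{
        /* index of the first chunk touched, and one past the last */
        int first = base >> DEV_BSHIFT;
        int last = (base + size + DEV_BSIZE - 1) >> DEV_BSHIFT;

        return (((1 << last) - 1) & ~((1 << first) - 1));
}

Because the valid and dirty masks are kept separate, NFS can mark chunks dirty without marking them valid; that is why vm_page_set_invalid() now clears both masks unconditionally instead of clearing dirty only when the whole page became invalid.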
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 8072f66..abff794 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_page.h,v 1.58 1999/03/15 05:09:48 julian Exp $
+ * $Id: vm_page.h,v 1.59 1999/04/05 19:38:29 julian Exp $
*/
/*
@@ -101,6 +101,10 @@
* Fields in this structure are locked either by the lock on the
* object that the page belongs to (O) or by the lock on the page
* queues (P).
+ *
+ * The 'valid' and 'dirty' fields are distinct. A page may have dirty
+ * bits set without having associated valid bits set. This is used by
+ * NFS to implement piecemeal writes.
*/
TAILQ_HEAD(pglist, vm_page);
@@ -404,6 +408,8 @@ void vm_page_wire __P((vm_page_t));
void vm_page_unqueue __P((vm_page_t));
void vm_page_unqueue_nowakeup __P((vm_page_t));
void vm_page_set_validclean __P((vm_page_t, int, int));
+void vm_page_set_dirty __P((vm_page_t, int, int));
+void vm_page_clear_dirty __P((vm_page_t, int, int));
void vm_page_set_invalid __P((vm_page_t, int, int));
static __inline boolean_t vm_page_zero_fill __P((vm_page_t));
int vm_page_is_valid __P((vm_page_t, int, int));
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
index 36a905e..dbacceb 100644
--- a/sys/vm/vm_pager.c
+++ b/sys/vm/vm_pager.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pager.c,v 1.44 1999/03/14 09:20:00 julian Exp $
+ * $Id: vm_pager.c,v 1.45 1999/04/11 02:16:27 eivind Exp $
*/
/*
@@ -523,6 +523,9 @@ vm_pager_chain_iodone(struct buf *nbp)
* Obtain a physical buffer and chain it to its parent buffer. When
* I/O completes, the parent buffer will be B_SIGNAL'd. Errors are
 * automatically propagated to the parent.
+ *
+ * Since these are brand new buffers, we do not have to clear B_INVAL
+ * and B_ERROR because they are already clear.
*/
struct buf *
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
index 82b6574..aff14ab 100644
--- a/sys/vm/vm_pager.h
+++ b/sys/vm/vm_pager.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
- * $Id: vm_pager.h,v 1.20 1999/01/24 02:32:15 dillon Exp $
+ * $Id: vm_pager.h,v 1.21 1999/03/14 09:20:00 julian Exp $
*/
/*
@@ -110,6 +110,14 @@ void flushchainbuf(struct buf *nbp);
void waitchainbuf(struct buf *bp, int count, int done);
void autochaindone(struct buf *bp);
+/*
+ * vm_pager_get_pages:
+ *
+ * Retrieve pages from the VM system in order to map them into an object
+ * ( or into VM space somewhere ). If the pagein was successful, we
+ * must fully validate it.
+ */
+
static __inline int
vm_pager_get_pages(
vm_object_t object,
@@ -117,7 +125,13 @@ vm_pager_get_pages(
int count,
int reqpage
) {
- return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
+ int r;
+
+ r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
+ if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
+ vm_page_zero_invalid(m[reqpage], TRUE);
+ }
+ return(r);
}
static __inline void
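With the wrapper above, a successful pagein implies a fully valid page, so callers (vm_fault in particular) can map the page without their own zero-fill pass. A caller-side sketch; names other than vm_pager_get_pages() and the VM_PAGER_*/VM_PAGE_BITS_ALL constants are illustrative:

        int rv;

        rv = vm_pager_get_pages(object, marray, count, reqpage);
        if (rv == VM_PAGER_OK &&
            marray[reqpage]->valid != VM_PAGE_BITS_ALL)
                panic("pager returned a partially valid page");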
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 628bec7..83f379a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -38,7 +38,7 @@
* SUCH DAMAGE.
*
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
- * $Id: vnode_pager.c,v 1.106 1999/04/05 19:38:29 julian Exp $
+ * $Id: vnode_pager.c,v 1.107 1999/04/10 20:52:11 dt Exp $
*/
/*
@@ -789,7 +789,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
* read.
*/
vm_page_set_validclean(mt, 0, size - tfoff);
- vm_page_zero_invalid(mt, FALSE);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(mt, FALSE); */
}
vm_page_flag_clear(mt, PG_ZERO);