-rw-r--r--  sys/gnu/ext2fs/ext2_bmap.c        3
-rw-r--r--  sys/gnu/fs/ext2fs/ext2_bmap.c     3
-rw-r--r--  sys/kern/vfs_bio.c              830
-rw-r--r--  sys/kern/vfs_cluster.c           10
-rw-r--r--  sys/kern/vfs_default.c           12
-rw-r--r--  sys/nfs/nfs.h                     5
-rw-r--r--  sys/nfs/nfs_bio.c               413
-rw-r--r--  sys/nfs/nfs_nqlease.c             6
-rw-r--r--  sys/nfs/nfs_socket.c            136
-rw-r--r--  sys/nfs/nfs_vnops.c              38
-rw-r--r--  sys/nfsclient/nfs.h               5
-rw-r--r--  sys/nfsclient/nfs_bio.c         413
-rw-r--r--  sys/nfsclient/nfs_socket.c      136
-rw-r--r--  sys/nfsclient/nfs_vnops.c        38
-rw-r--r--  sys/nfsclient/nfsargs.h           5
-rw-r--r--  sys/nfsclient/nfsstats.h          5
-rw-r--r--  sys/nfsserver/nfs.h               5
-rw-r--r--  sys/nfsserver/nfs_srvsock.c     136
-rw-r--r--  sys/nfsserver/nfsrvstats.h        5
-rw-r--r--  sys/sys/bio.h                    40
-rw-r--r--  sys/sys/buf.h                    40
-rw-r--r--  sys/ufs/ffs/ffs_inode.c           3
-rw-r--r--  sys/ufs/mfs/mfs_vnops.c           5
-rw-r--r--  sys/ufs/ufs/ufs_bmap.c            3
-rw-r--r--  sys/vm/vm_fault.c                30
-rw-r--r--  sys/vm/vm_page.c                 51
-rw-r--r--  sys/vm/vm_page.h                  8
-rw-r--r--  sys/vm/vm_pager.c                 5
-rw-r--r--  sys/vm/vnode_pager.c              5
-rw-r--r--  sys/vm/vm_pager.h                18
30 files changed, 1414 insertions, 998 deletions
diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/gnu/ext2fs/ext2_bmap.c
+++ b/sys/gnu/ext2fs/ext2_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/gnu/fs/ext2fs/ext2_bmap.c
+++ b/sys/gnu/fs/ext2fs/ext2_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
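
The one-line change in both ext2_bmap hunks above is one instance of a protocol
applied throughout this commit: B_INVAL and B_ERROR must be cleared before
initiating a read, since a recycled buffer header may still carry stale state.
A minimal sketch of the pattern, with locking and error details elided (the
same flag dance appears again in the vfs_cluster.c and nfs_bio.c hunks below):

        bp->b_flags |= B_READ;
        bp->b_flags &= ~(B_INVAL | B_ERROR);    /* stale state from prior use */
        vfs_busy_pages(bp, 0);                  /* busy backing pages for read */
        VOP_STRATEGY(bp->b_vp, bp);             /* initiate the I/O */
        error = biowait(bp);                    /* synchronous caller waits */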
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 803aab1..cb18320 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.206 1999/04/14 18:51:52 dt Exp $
+ * $Id: vfs_bio.c,v 1.207 1999/04/29 18:15:25 alc Exp $
*/
/*
@@ -74,9 +74,6 @@ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
vm_offset_t to);
-static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
- vm_offset_t off, vm_offset_t size,
- vm_page_t m);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
@@ -222,6 +219,27 @@ bufcountwakeup(void)
}
/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline__
+void
+vfs_buf_test_cache(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+
+/*
* Initialize buffer headers and related structures.
*/
void
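
To make the vfs_buf_test_cache() check added above concrete, a worked example,
assuming PAGE_SIZE 4096 and a buffer at b_offset (foff) 4608, i.e. 512 bytes
into its second page, being extended by 1024 bytes at buffer-relative offset
(off) 3584 against page m:

        base = (foff + off) & PAGE_MASK = (4608 + 3584) & 4095 = 0

so the extension covers bytes 0-1023 of m. If vm_page_is_valid(m, 0, 1024)
fails, B_CACHE is cleared and the caller must treat the buffer as needing I/O.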
@@ -371,7 +389,10 @@ bremfree(struct buf * bp)
/*
- * Get a buffer with the specified data. Look in the cache first.
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything ( see
+ * getblk() ).
*/
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
@@ -388,7 +409,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
curproc->p_stats->p_ru.ru_inblock++;
KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -403,7 +424,9 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
/*
* Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.
+ * read-ahead blocks. We must clear B_ERROR and B_INVAL prior
+ * to initiating I/O. If B_CACHE is set, the buffer is valid
+ * and we do not have to do anything.
*/
int
breadn(struct vnode * vp, daddr_t blkno, int size,
@@ -421,7 +444,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -441,7 +464,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
rabp->b_flags |= B_READ | B_ASYNC;
- rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ rabp->b_flags &= ~(B_ERROR | B_INVAL);
if (rabp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
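
For reference, the caller-side contract of bread()/breadn() is unchanged by
this commit: the caller gets a busy buffer back through *bpp whether or not
the read succeeded, and must brelse() it either way. A minimal sketch of the
usual FFS-style usage:

        struct buf *bp;
        int error;

        error = bread(vp, lbn, size, NOCRED, &bp);
        if (error) {
                brelse(bp);     /* buffer is returned even on error */
                return (error);
        }
        /* ... use bp->b_data ... */
        brelse(bp);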
@@ -462,7 +485,14 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
/*
* Write, release buffer on completion. (Done by iodone
- * if async.)
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
*/
int
bwrite(struct buf * bp)
@@ -486,7 +516,7 @@ bwrite(struct buf * bp)
bundirty(bp);
bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
- bp->b_flags |= B_WRITEINPROG;
+ bp->b_flags |= B_WRITEINPROG | B_CACHE;
bp->b_vp->v_numoutput++;
vfs_busy_pages(bp, 1);
@@ -505,11 +535,12 @@ bwrite(struct buf * bp)
mp = vp->v_specmountpoint;
else
mp = vp->v_mount;
- if (mp != NULL)
+ if (mp != NULL) {
if ((oldflags & B_ASYNC) == 0)
mp->mnt_stat.f_syncwrites++;
else
mp->mnt_stat.f_asyncwrites++;
+ }
}
if ((oldflags & B_ASYNC) == 0) {
@@ -522,7 +553,13 @@ bwrite(struct buf * bp)
}
/*
- * Delayed write. (Buffer is marked dirty).
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather than in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
*/
void
bdwrite(struct buf * bp)
@@ -542,6 +579,12 @@ bdwrite(struct buf * bp)
bdirty(bp);
/*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
* This bmap keeps the system from needing to do the bmap later,
* perhaps when the system is attempting to do a sync. Since it
* is likely that the indirect block -- or whatever other datastructure
@@ -592,8 +635,11 @@ bdwrite(struct buf * bp)
* B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
* itself to properly update it in the dirty/clean lists. We mark it
* B_DONE to ensure that any asynchronization of the buffer properly
- * clears B_DONE ( else a panic will occur later ). Note that B_INVALID
- * buffers are not considered dirty even if B_DELWRI is set.
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
*
* Since the buffer is not on a queue, we do not update the numfreebuffers
* count.
@@ -645,6 +691,9 @@ bundirty(bp)
*
* Asynchronous write. Start output on a buffer, but do not wait for
* it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
*/
void
bawrite(struct buf * bp)
@@ -658,7 +707,8 @@ bawrite(struct buf * bp)
*
* Ordered write. Start output on a buffer, and flag it so that the
* device will write it in the order it was queued. The buffer is
- * released when the output completes.
+ * released when the output completes. bwrite() ( or the VOP routine
+ * anyway ) is responsible for handling B_INVAL buffers.
*/
int
bowrite(struct buf * bp)
@@ -694,10 +744,19 @@ brelse(struct buf * bp)
bp->b_flags &= ~B_ERROR;
if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) {
+ /*
+ * Failed write, redirty. Must clear B_ERROR to prevent
+ * pages from being scrapped. Note: B_INVAL is ignored
+ * here but will presumably be dealt with later.
+ */
bp->b_flags &= ~B_ERROR;
bdirty(bp);
} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
(bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed I/O or we were asked to free or not
+ * cache the buffer.
+ */
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
@@ -727,31 +786,22 @@ brelse(struct buf * bp)
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
- * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
- * but the VM object is kept around. The B_NOCACHE flag is used to
- * invalidate the pages in the VM object.
+ * constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
*
- * The b_{validoff,validend,dirtyoff,dirtyend} values are relative
- * to b_offset and currently have byte granularity, whereas the
- * valid flags in the vm_pages have only DEV_BSIZE resolution.
- * The byte resolution fields are used to avoid unnecessary re-reads
- * of the buffer but the code really needs to be genericized so
- * other filesystem modules can take advantage of these fields.
+ * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. B_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
*
- * XXX this seems to cause performance problems.
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer.
*/
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
bp->b_vp->v_type != VBLK &&
- (bp->b_flags & B_DELWRI) != 0)
-#ifdef notdef
- && (bp->b_vp->v_tag != VT_NFS
- || bp->b_vp->v_type == VBLK
- || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
- || bp->b_validend == 0
- || (bp->b_validoff == 0
- && bp->b_validend == bp->b_bufsize))
-#endif
+ (bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@@ -912,6 +962,11 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
* it.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
*/
void
bqrelse(struct buf * bp)
@@ -1096,6 +1151,8 @@ vfs_bio_awrite(struct buf * bp)
splx(s);
/*
* default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
*/
nwritten = bp->b_bufsize;
(void) VOP_BWRITE(bp);
@@ -1107,7 +1164,11 @@ vfs_bio_awrite(struct buf * bp)
* getnewbuf:
*
* Find and initialize a new buffer header, freeing up existing buffers
- * in the bufqueues as necessary.
+ * in the bufqueues as necessary. The new buffer is returned with
+ * flags set to B_BUSY.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
*
* We block if:
* We have insufficient buffer headers
@@ -1368,7 +1429,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_validoff = bp->b_validend = 0;
bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@@ -1465,7 +1525,10 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
-
+
+ /*
+ * The bp, if valid, is set to B_BUSY.
+ */
return (bp);
}
@@ -1546,9 +1609,10 @@ flushbufqueues(void)
}
/*
- * XXX NFS does weird things with B_INVAL bps if we bwrite
- * them ( vfs_bio_awrite/bawrite/bdwrite/etc ) Why?
- *
+ * Try to free up B_INVAL delayed-write buffers rather than
+ * writing them out. Note also that NFS is somewhat sensitive
+ * to B_INVAL buffers so it is doubly important that we do
+ * this.
*/
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@@ -1622,20 +1686,28 @@ inmem(struct vnode * vp, daddr_t blkno)
}
/*
- * now we set the dirty range for the buffer --
- * for NFS -- if the file is mapped and pages have
- * been written to, let it know. We want the
- * entire range of the buffer to be marked dirty if
- * any of the pages have been written to for consistancy
- * with the b_validoff, b_validend set in the nfs write
- * code, and used by the nfs read code.
+ * vfs_setdirty:
+ *
+ * Sets the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer.
+ *
+ * The range is limited to the size of the buffer.
+ *
+ * This routine is primarily used by NFS, but is generalized for the
+ * B_VMIO case.
*/
static void
vfs_setdirty(struct buf *bp)
{
int i;
vm_object_t object;
- vm_offset_t boffset;
+
+ /*
+ * Degenerate case - empty buffer
+ */
+
+ if (bp->b_bufsize == 0)
+ return;
/*
* We qualify the scan for modified pages on whether the
@@ -1654,6 +1726,9 @@ vfs_setdirty(struct buf *bp)
printf("Warning: object %p mightbedirty but not writeable\n", object);
if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
/*
* test the pages to see if they have been modified directly
* by users through the VM system.
@@ -1664,47 +1739,85 @@ vfs_setdirty(struct buf *bp)
}
/*
- * scan forwards for the first page modified
+ * Calculate the encompassing dirty range, boffset through eoffset,
+ * i.e. (eoffset - boffset) bytes.
*/
+
for (i = 0; i < bp->b_npages; i++) {
- if (bp->b_pages[i]->dirty) {
+ if (bp->b_pages[i]->dirty)
break;
- }
}
-
boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
- if (boffset < bp->b_dirtyoff) {
- bp->b_dirtyoff = max(boffset, 0);
- }
- /*
- * scan backwards for the last page modified
- */
for (i = bp->b_npages - 1; i >= 0; --i) {
if (bp->b_pages[i]->dirty) {
break;
}
}
- boffset = (i + 1);
-#if 0
- offset = boffset + bp->b_pages[0]->pindex;
- if (offset >= object->size)
- boffset = object->size - bp->b_pages[0]->pindex;
-#endif
- boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
- if (bp->b_dirtyend < boffset)
- bp->b_dirtyend = min(boffset, bp->b_bufsize);
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
}
}
/*
- * Get a block given a specified block and offset into a file/device.
+ * getblk:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ * The buffer's B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and B_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
*/
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
struct buf *bp;
- int i, s;
+ int s;
struct bufhashhdr *bh;
#if !defined(MAX_PERF)
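
The getblk() contract spelled out in the comment above is worth restating as
code. A minimal sketch of the intended caller protocol for a synchronous
strategy-path read; the NFS hunks later in this diff follow the same shape,
substituting nfs_doio() for the VOP_STRATEGY()/biowait() pair:

        struct buf *bp;
        int error;

        bp = getblk(vp, lbn, size, 0, 0);
        if ((bp->b_flags & B_CACHE) == 0) {
                /* not fully valid: issue the read per the rules above */
                bp->b_flags |= B_READ;
                bp->b_flags &= ~(B_INVAL | B_ERROR);
                vfs_busy_pages(bp, 0);
                VOP_STRATEGY(vp, bp);
                error = biowait(bp);    /* biodone() sets B_CACHE on success */
                if (error) {
                        brelse(bp);
                        return (error);
                }
        }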
@@ -1727,6 +1840,10 @@ loop:
}
if ((bp = gbincore(vp, blkno))) {
+ /*
+ * Buffer is in-core
+ */
+
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
if (bp->b_usecount < BUF_MAXUSE)
@@ -1740,7 +1857,18 @@ loop:
splx(s);
return (struct buf *) NULL;
}
- bp->b_flags |= B_BUSY | B_CACHE;
+
+ /*
+ * Busy the buffer. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ bp->b_flags |= B_BUSY;
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO|B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
bremfree(bp);
/*
@@ -1770,7 +1898,9 @@ loop:
/*
* If the size is inconsistent in the VMIO case, we can resize
- * the buffer. This might lead to B_CACHE getting cleared.
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
*/
if (bp->b_bcount != size)
@@ -1780,45 +1910,19 @@ loop:
("getblk: no buffer offset"));
/*
- * Check that the constituted buffer really deserves for the
- * B_CACHE bit to be set. B_VMIO type buffers might not
- * contain fully valid pages. Normal (old-style) buffers
- * should be fully valid. This might also lead to B_CACHE
- * getting clear.
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
*
- * If B_CACHE is already clear, don't bother checking to see
- * if we have to clear it again.
- *
- * XXX this code should not be necessary unless the B_CACHE
- * handling is broken elsewhere in the kernel. We need to
- * check the cases and then turn the clearing part of this
- * code into a panic.
- */
- if (
- (bp->b_flags & (B_VMIO|B_CACHE)) == (B_VMIO|B_CACHE) &&
- (bp->b_vp->v_tag != VT_NFS || bp->b_validend <= 0)
- ) {
- int checksize = bp->b_bufsize;
- int poffset = bp->b_offset & PAGE_MASK;
- int resid;
- for (i = 0; i < bp->b_npages; i++) {
- resid = (checksize > (PAGE_SIZE - poffset)) ?
- (PAGE_SIZE - poffset) : checksize;
- if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
- bp->b_flags &= ~(B_CACHE | B_DONE);
- break;
- }
- checksize -= resid;
- poffset = 0;
- }
- }
-
- /*
- * If B_DELWRI is set and B_CACHE got cleared ( or was
- * already clear ), we have to commit the write and
- * retry. The NFS code absolutely depends on this,
- * and so might the FFS code. In anycase, it formalizes
- * the B_CACHE rules. See sys/buf.h.
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
@@ -1829,8 +1933,14 @@ loop:
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
splx(s);
- return (bp);
+ bp->b_flags &= ~B_DONE;
} else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is marked B_BUSY. Note that the
+ * returned buffer is also considered valid ( not marked
+ * B_INVAL ).
+ */
int bsize, maxsize, vmio;
off_t offset;
@@ -1849,7 +1959,7 @@ loop:
maxsize = imax(maxsize, bsize);
if ((bp = getnewbuf(vp, blkno,
- slpflag, slptimeo, size, maxsize)) == 0) {
+ slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@@ -1861,6 +1971,10 @@ loop:
* This code is used to make sure that a buffer is not
* created while the getnewbuf routine is blocked.
* This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created. There is now window
+ * race because we are safely running at splbio() from the
+ * point of the duplicate buffer creation through to here.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -1880,8 +1994,15 @@ loop:
bh = BUFHASH(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
+ /*
+ * Set the B_VMIO bit and allocbuf() the buffer bigger. Since the
+ * buffer size starts out as 0, B_CACHE will be set by
+ * allocbuf() for the VMIO case prior to it testing the
+ * backing store for validity.
+ */
+
if (vmio) {
- bp->b_flags |= (B_VMIO | B_CACHE);
+ bp->b_flags |= B_VMIO;
#if defined(VFS_BIO_DEBUG)
if (vp->v_type != VREG && vp->v_type != VBLK)
printf("getblk: vmioing file type %d???\n", vp->v_type);
@@ -1893,12 +2014,14 @@ loop:
allocbuf(bp, size);
splx(s);
- return (bp);
+ bp->b_flags &= ~B_DONE;
}
+ return (bp);
}
/*
- * Get an empty, disassociated buffer of given size.
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
*/
struct buf *
geteblk(int size)
@@ -1910,7 +2033,7 @@ geteblk(int size)
while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
- bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
return (bp);
}
@@ -1925,6 +2048,9 @@ geteblk(int size)
* deadlock or inconsistent data situations. Tread lightly!!!
* There are B_CACHE and B_DELWRI interactions that must be dealt with by
* the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
*/
int
@@ -1945,7 +2071,8 @@ allocbuf(struct buf *bp, int size)
caddr_t origbuf;
int origbufsize;
/*
- * Just get anonymous memory from the kernel
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
*/
mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
@@ -2046,13 +2173,25 @@ allocbuf(struct buf *bp, int size)
if (bp->b_flags & B_MALLOC)
panic("allocbuf: VMIO buffer can't be malloced");
#endif
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
if (newbsize < bp->b_bufsize) {
+ /*
+ * DEV_BSIZE aligned new buffer size is less than the
+ * DEV_BSIZE aligned existing buffer size. Figure out
+ * if we have to remove any pages.
+ */
if (desiredpages < bp->b_npages) {
for (i = desiredpages; i < bp->b_npages; i++) {
/*
* the page is not freed here -- it
- * is the responsibility of vnode_pager_setsize
+ * is the responsibility of
+ * vnode_pager_setsize
*/
m = bp->b_pages[i];
KASSERT(m != bogus_page,
@@ -2067,115 +2206,131 @@ allocbuf(struct buf *bp, int size)
(desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
bp->b_npages = desiredpages;
}
- } else if (newbsize > bp->b_bufsize) {
- vm_object_t obj;
- vm_offset_t tinc, toff;
- vm_ooffset_t off;
- vm_pindex_t objoff;
- int pageindex, curbpnpages;
+ } else if (size > bp->b_bcount) {
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
struct vnode *vp;
- int bsize;
- int orig_validoff = bp->b_validoff;
- int orig_validend = bp->b_validend;
-
- vp = bp->b_vp;
-
- if (vp->v_type == VBLK)
- bsize = DEV_BSIZE;
- else
- bsize = vp->v_mount->mnt_stat.f_iosize;
-
- if (bp->b_npages < desiredpages) {
- obj = vp->v_object;
- tinc = PAGE_SIZE;
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
- off = bp->b_offset;
- KASSERT(bp->b_offset != NOOFFSET,
- ("allocbuf: no buffer offset"));
- curbpnpages = bp->b_npages;
- doretry:
- bp->b_validoff = orig_validoff;
- bp->b_validend = orig_validend;
- bp->b_flags |= B_CACHE;
- for (toff = 0; toff < newbsize; toff += tinc) {
- objoff = OFF_TO_IDX(off + toff);
- pageindex = objoff - OFF_TO_IDX(off);
- tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
- if (pageindex < curbpnpages) {
-
- m = bp->b_pages[pageindex];
-#ifdef VFS_BIO_DIAG
- if (m->pindex != objoff)
- panic("allocbuf: page changed offset?!!!?");
-#endif
- if (tinc > (newbsize - toff))
- tinc = newbsize - toff;
- if (bp->b_flags & B_CACHE)
- vfs_buf_set_valid(bp, off, toff, tinc, m);
- continue;
- }
- m = vm_page_lookup(obj, objoff);
- if (!m) {
- m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
- if (!m) {
- VM_WAIT;
- vm_pageout_deficit += (desiredpages - curbpnpages);
- goto doretry;
- }
+ /*
+ * Step 1, bring in the VM pages from the object,
+ * allocating them if necessary. We must clear
+ * B_CACHE if these pages are not valid for the
+ * range covered by the buffer.
+ */
+ vp = bp->b_vp;
+ obj = vp->v_object;
+
+ while (bp->b_npages < desiredpages) {
+ vm_page_t m;
+ vm_pindex_t pi;
+
+ pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
+ if ((m = vm_page_lookup(obj, pi)) == NULL) {
+ m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
+ if (m == NULL) {
+ VM_WAIT;
+ vm_pageout_deficit += desiredpages - bp->b_npages;
+ } else {
vm_page_wire(m);
vm_page_wakeup(m);
bp->b_flags &= ~B_CACHE;
-
- } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) {
- /*
- * If we had to sleep, retry.
- *
- * Also note that we only test
- * PG_BUSY here, not m->busy.
- *
- * We cannot sleep on m->busy
- * here because a vm_fault ->
- * getpages -> cluster-read ->
- * ...-> allocbuf sequence
- * will convert PG_BUSY to
- * m->busy so we have to let
- * m->busy through if we do
- * not want to deadlock.
- */
- goto doretry;
- } else {
- if ((curproc != pageproc) &&
- ((m->queue - m->pc) == PQ_CACHE) &&
- ((cnt.v_free_count + cnt.v_cache_count) <
- (cnt.v_free_min + cnt.v_cache_min))) {
- pagedaemon_wakeup();
- }
- if (tinc > (newbsize - toff))
- tinc = newbsize - toff;
- if (bp->b_flags & B_CACHE)
- vfs_buf_set_valid(bp, off, toff, tinc, m);
- vm_page_flag_clear(m, PG_ZERO);
- vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
}
- bp->b_pages[pageindex] = m;
- curbpnpages = pageindex + 1;
+ continue;
}
- if (vp->v_tag == VT_NFS &&
- vp->v_type != VBLK) {
- if (bp->b_dirtyend > 0) {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
- }
- if (bp->b_validend == 0)
- bp->b_flags &= ~B_CACHE;
+
+ /*
+ * We found a page. If we have to sleep on it,
+ * retry because it might have gotten freed out
+ * from under us.
+ *
+ * We can only test PG_BUSY here. Blocking on
+ * m->busy might lead to a deadlock:
+ *
+ * vm_fault->getpages->cluster_read->allocbuf
+ *
+ */
+
+ if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
+ continue;
+
+ /*
+ * We have a good page. Should we wakeup the
+ * page daemon?
+ */
+ if ((curproc != pageproc) &&
+ ((m->queue - m->pc) == PQ_CACHE) &&
+ ((cnt.v_free_count + cnt.v_cache_count) <
+ (cnt.v_free_min + cnt.v_cache_min))
+ ) {
+ pagedaemon_wakeup();
}
- bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
- bp->b_npages = curbpnpages;
- pmap_qenter((vm_offset_t) bp->b_data,
- bp->b_pages, bp->b_npages);
- ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
}
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+ * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >>
+ PAGE_SHIFT;
+
+ vfs_buf_test_cache(
+ bp,
+ bp->b_offset,
+ toff,
+ tinc,
+ bp->b_pages[pi]
+ );
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+
+ /*
+ * Step 3, fixup the KVM pmap. Remember that
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+
+ bp->b_data = (caddr_t)
+ trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter(
+ (vm_offset_t)bp->b_data,
+ bp->b_pages,
+ bp->b_npages
+ );
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
if (bp->b_flags & B_VMIO)
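
A worked pass through the Step 2 scan above, assuming PAGE_SIZE 4096 and a
buffer with b_offset 9216 (page-relative offset 1024) growing from b_bcount 0
to size 6144:

        iteration 1: toff = 0,    tinc = 4096 - 1024 = 3072,
                     pi = (1024 + 0) >> PAGE_SHIFT = 0
        iteration 2: toff = 3072, tinc clipped to 6144 - 3072 = 3072,
                     pi = (1024 + 3072) >> PAGE_SHIFT = 1
        toff reaches 6144 == size and the scan ends.

Each step calls vfs_buf_test_cache() against bp->b_pages[pi]; the first range
found invalid clears B_CACHE and terminates the while loop.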
@@ -2184,13 +2339,17 @@ allocbuf(struct buf *bp, int size)
runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
- bp->b_bufsize = newbsize;
- bp->b_bcount = size;
+ bp->b_bufsize = newbsize; /* actual buffer allocation */
+ bp->b_bcount = size; /* requested buffer size */
return 1;
}
/*
- * Wait for buffer I/O completion, returning error status.
+ * biowait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left B_BUSY|B_DONE on return. B_EINTR is converted into an EINTR
+ * error and cleared.
*/
int
biowait(register struct buf * bp)
@@ -2220,9 +2379,23 @@ biowait(register struct buf * bp)
}
/*
- * Finish I/O on a buffer, calling an optional function.
- * This is usually called from interrupt level, so process blocking
- * is not *a good idea*.
+ * biodone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ * read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * biodone does not mess with B_INVAL, allowing the I/O routine or the
+ * initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
*/
void
biodone(register struct buf * bp)
@@ -2295,7 +2468,17 @@ biodone(register struct buf * bp)
obj->paging_in_progress, bp->b_npages);
}
#endif
- iosize = bp->b_bufsize;
+
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+ * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ iosize = bp->b_bcount;
+ if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
+ bp->b_flags |= B_CACHE;
+ }
+
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
m = bp->b_pages[i];
@@ -2307,6 +2490,7 @@ biodone(register struct buf * bp)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
+ bp->b_flags &= ~B_CACHE;
continue;
}
bp->b_pages[i] = m;
@@ -2325,8 +2509,8 @@ biodone(register struct buf * bp)
/*
* In the write case, the valid and clean bits are
- * already changed correctly, so we only need to do this
- * here in the read case.
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
*/
if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
vfs_page_set_valid(bp, foff, i, m);
@@ -2453,106 +2637,45 @@ vfs_unbusy_pages(struct buf * bp)
}
/*
- * Set NFS' b_validoff and b_validend fields from the valid bits
- * of a page. If the consumer is not NFS, and the page is not
- * valid for the entire range, clear the B_CACHE flag to force
- * the consumer to re-read the page.
+ * vfs_page_set_valid:
*
- * B_CACHE interaction is especially tricky.
- */
-static void
-vfs_buf_set_valid(struct buf *bp,
- vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
- vm_page_t m)
-{
- if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
- vm_offset_t svalid, evalid;
- int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
-
- /*
- * This only bothers with the first valid range in the
- * page.
- */
- svalid = off;
- while (validbits && !(validbits & 1)) {
- svalid += DEV_BSIZE;
- validbits >>= 1;
- }
- evalid = svalid;
- while (validbits & 1) {
- evalid += DEV_BSIZE;
- validbits >>= 1;
- }
- evalid = min(evalid, off + size);
- /*
- * We can only set b_validoff/end if this range is contiguous
- * with the range built up already. If we cannot set
- * b_validoff/end, we must clear B_CACHE to force an update
- * to clean the bp up.
- */
- if (svalid == bp->b_validend) {
- bp->b_validoff = min(bp->b_validoff, svalid);
- bp->b_validend = max(bp->b_validend, evalid);
- } else {
- bp->b_flags &= ~B_CACHE;
- }
- } else if (!vm_page_is_valid(m,
- (vm_offset_t) ((foff + off) & PAGE_MASK),
- size)) {
- bp->b_flags &= ~B_CACHE;
- }
-}
-
-/*
- * Set the valid bits in a page, taking care of the b_validoff,
- * b_validend fields which NFS uses to optimise small reads. Off is
- * the offset within the file and pageno is the page index within the buf.
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
*
- * XXX we have to set the valid & clean bits for all page fragments
- * touched by b_validoff/validend, even if the page fragment goes somewhat
- * beyond b_validoff/validend due to alignment.
+ * For NFS, the range is additionally restricted to b_validoff/end.
+ * validoff/end must be DEV_BSIZE chunky or the end must be at the
+ * file EOF. If a dirty range exists, set the page's dirty bits
+ * inclusively.
+ *
+ * This routine is typically called after a read completes.
*/
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
- struct vnode *vp = bp->b_vp;
vm_ooffset_t soff, eoff;
/*
* Start and end offsets in buffer. eoff - soff may not cross a
- * page boundry or cross the end of the buffer.
+ * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
*/
soff = off;
eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
- if (eoff > bp->b_offset + bp->b_bufsize)
- eoff = bp->b_offset + bp->b_bufsize;
-
- if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
- vm_ooffset_t sv, ev;
- vm_page_set_invalid(m,
- (vm_offset_t) (soff & PAGE_MASK),
- (vm_offset_t) (eoff - soff));
- /*
- * bp->b_validoff and bp->b_validend restrict the valid range
- * that we can set. Note that these offsets are not DEV_BSIZE
- * aligned. vm_page_set_validclean() must know what
- * sub-DEV_BSIZE ranges to clear.
- */
-#if 0
- sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
- ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) &
- ~(DEV_BSIZE - 1);
-#endif
- sv = bp->b_offset + bp->b_validoff;
- ev = bp->b_offset + bp->b_validend;
- soff = qmax(sv, soff);
- eoff = qmin(ev, eoff);
- }
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
- if (eoff > soff)
- vm_page_set_validclean(m,
- (vm_offset_t) (soff & PAGE_MASK),
- (vm_offset_t) (eoff - soff));
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
}
/*
@@ -2562,6 +2685,10 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* almost as being PG_BUSY. Also the object paging_in_progress
* flag is handled to make sure that the object doesn't become
* inconsistent.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as B_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
*/
void
vfs_busy_pages(struct buf * bp, int clear_modify)
@@ -2595,6 +2722,22 @@ retry:
vm_page_io_start(m);
}
+ /*
+ * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+
vm_page_protect(m, VM_PROT_NONE);
if (clear_modify)
vfs_page_set_valid(bp, foff, i, m);
@@ -2614,30 +2757,89 @@ retry:
* Tell the VM system that the pages associated with this buffer
* are clean. This is used for delayed writes where the data is
* going to go to disk eventually without additional VM intervention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
*/
-void
+static void
vfs_clean_pages(struct buf * bp)
{
int i;
if (bp->b_flags & B_VMIO) {
vm_ooffset_t foff;
+
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
+ vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ vm_ooffset_t eoff = noff;
+
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
vfs_page_set_valid(bp, foff, i, m);
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
}
}
}
+/*
+ * vfs_bio_set_validclean:
+ *
+ * Set the range within the buffer to valid and clean. The range is
+ * relative to the beginning of the buffer, b_offset. Note that b_offset
+ * itself may be offset from the beginning of the first page.
+ */
+
+void
+vfs_bio_set_validclean(struct buf *bp, int base, int size)
+{
+ if (bp->b_flags & B_VMIO) {
+ int i;
+ int n;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (n > size)
+ n = size;
+
+ vm_page_set_validclean(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
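
A sketch of how the new routine is meant to be used; the nfs_write() hunk
later in this diff does exactly this after copying user data into the buffer:

        /*
         * After copying n bytes into bp->b_data + on, mark just that
         * byte range valid and clean in the backing pages:
         */
        vfs_bio_set_validclean(bp, on, n);

Worked numbers, assuming PAGE_SIZE 4096, b_offset = 512, base = 1000 and
size = 5000: base becomes 1512, the first page validates bytes 1512-4095
(n = 2584), and the second page validates the remaining 2416 bytes.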
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * clear a buffer. This routine essentially fakes an I/O, so we need
+ * to clear B_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+
void
vfs_bio_clrbuf(struct buf *bp) {
int i, mask = 0;
caddr_t sa, ea;
if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
(bp->b_offset & PAGE_MASK) == 0) {
mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index f7bd95e..5f7f870 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $
+ * $Id: vfs_cluster.c,v 1.80 1999/03/12 02:24:56 julian Exp $
*/
#include "opt_debug_cluster.h"
@@ -251,6 +251,7 @@ single_block_read:
#endif
if ((bp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(bp, 0);
+ bp->b_flags &= ~(B_ERROR|B_INVAL);
error = VOP_STRATEGY(vp, bp);
curproc->p_stats->p_ru.ru_inblock++;
}
@@ -283,6 +284,7 @@ single_block_read:
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
+ rbp->b_flags &= ~(B_ERROR|B_INVAL);
(void) VOP_STRATEGY(vp, rbp);
curproc->p_stats->p_ru.ru_inblock++;
}
@@ -473,8 +475,10 @@ cluster_callback(bp)
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
- } else
- tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~(B_ERROR|B_INVAL);
+ }
biodone(tbp);
}
relpbuf(bp, &cluster_pbuf_freecnt);
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index c0565a4..de5d18d 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -138,6 +138,18 @@ vop_panic(struct vop_generic_args *ap)
panic("illegal vnode op called");
}
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * B_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a B_READ strategy call. Typically
+ * B_INVAL is assumed to already be clear prior to a write and should not
+ * be cleared manually unless you just made the buffer invalid. B_ERROR
+ * should be cleared either way.
+ */
+
static int
vop_nostrategy (struct vop_strategy_args *ap)
{
diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfs/nfs.h
+++ b/sys/nfs/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index cef982b..0d8a782 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
+ * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
-static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
- int i, error, nextoff, size, toff, npages, count;
+ int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ npages = btoc(count);
+
+ /*
+ * If the requested page is partially valid, just return it and
+ * allow the pager to zero-out the blanks. Partially valid pages
+ * can only occur at the file EOF.
+ */
+
+ {
+ vm_page_t m = pages[ap->a_reqpage];
+
+ if (m->valid != 0) {
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
+ return(0);
+ }
+ }
+
/*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
- npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
- * Read operation filled a partial page, set valid
- * bits properly. validclean will zero out
- * any cruft in the buffer when setting a valid bit,
- * if the size is not DEV_BSIZE aligned.
+ * Read operation filled a partial page.
*/
+ m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
- } else {
- /*
- * This page is being mapped, clear out any other
- * cruft in the invalid areas of the page.
- */
- if (m->valid && m->valid != VM_PAGE_BITS_ALL)
- vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
+ off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
+ struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
+ np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
+ offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@@ -253,6 +271,16 @@ nfs_putpages(ap)
}
/*
+ * When putting pages, do not extend the file past EOF.
+ */
+
+ if (offset + count > np->n_size) {
+ count = np->n_size - offset;
+ if (count < 0)
+ count = 0;
+ }
+
+ /*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
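
The EOF clamp added above keeps a putpages run from writing past the file size
the client believes in. For example, assuming np->n_size = 10000 and a
putpages call with offset = 8192 and count = 4096, count is chopped to
10000 - 8192 = 1808, so only the bytes that actually exist are pushed to the
server.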
@@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
+ uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
-nfs_bioread(vp, uio, ioflag, cred, getpages)
+nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
- int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
- off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
- int bufsize;
- int nra, error = 0, n = 0, on = 0, not_readin;
+ int bcount;
+ int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- not_readin = 1;
/*
* Start the read ahead(s), as required.
@@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
- * If the block is in the cache and has the required data
- * in a valid region, just copy it out.
- * Otherwise, get the block and write back/read in,
- * as required.
+ * Obtain the buffer cache block. Figure out the buffer size
+ * when we are at EOF. nfs_getcacheblk() will also force
+ * uncached delayed-writes to be flushed to the server.
+ *
+ * Note that bcount is *not* DEV_BSIZE aligned.
*/
-again:
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size &&
- (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ bcount = biosize;
+ if ((off_t)lbn * biosize >= np->n_size) {
+ bcount = 0;
+ } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
+ bcount = np->n_size - (off_t)lbn * biosize;
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
- * If we are being called from nfs_getpages, we must
- * make sure the buffer is a vmio buffer. The vp will
- * already be setup for vmio but there may be some old
- * non-vmio buffers attached to it.
+ * If B_CACHE is not set, we must issue the read. If this
+ * fails, we return an error.
*/
- if (getpages && !(bp->b_flags & B_VMIO)) {
-#ifdef DIAGNOSTIC
- printf("nfs_bioread: non vmio buf found, discarding\n");
-#endif
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
+
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
- not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -501,32 +509,20 @@ again:
return (error);
}
}
- if (bufsize > on) {
- n = min((unsigned)(bufsize - on), uio->uio_resid);
- } else {
- n = 0;
- }
- diff = np->n_size - uio->uio_offset;
- if (diff < n)
- n = diff;
- if (not_readin && n > 0) {
- if (on < bp->b_validoff || (on + n) > bp->b_validend) {
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
- }
+
+ /*
+ * on is the offset into the current bp. Figure out how many
+ * bytes we can copy out of the bp. Note that bcount is
+ * NOT DEV_BSIZE aligned.
+ *
+ * Then figure out how many bytes we can copy into the uio.
+ */
+
+ n = 0;
+ if (on < bcount)
+ n = min((unsigned)(bcount - on), uio->uio_resid);
+
vp->v_lastr = lbn;
- diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
- if (diff < n)
- n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
@@ -535,7 +531,6 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,13 +555,13 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
+ printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@@ -574,6 +569,10 @@ again:
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
+ *
+ * Leave the last bp intact unless there is an error.
+ * Loop back up to the while if the error is another
+ * NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@@ -582,21 +581,32 @@ again:
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
- if ((bp->b_flags & B_DONE) == 0) {
- bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
- vfs_busy_pages(bp, 0);
- error = nfs_doio(bp, cred, p);
- if (error == 0 && (bp->b_flags & B_INVAL))
- break;
- if (error) {
- brelse(bp);
- } else if (i < lbn) {
- brelse(bp);
- }
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ /*
+ * no error + B_INVAL == directory EOF,
+ * use the block.
+ */
+ if (error == 0 && (bp->b_flags & B_INVAL))
+ break;
}
+ /*
+ * An error will throw away the block and the
+ * for loop will break out. If no error and this
+ * is not the block we want, we throw away the
+ * block and go for the next one via the for loop.
+ */
+ if (error || i < lbn)
+ brelse(bp);
}
}
+ /*
+ * The above while is repeated if we hit another cookie
+ * error. If we hit an error and it wasn't a cookie error,
+ * we give up.
+ */
if (error)
return (error);
}
@@ -616,7 +626,6 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -629,10 +638,20 @@ again:
}
}
/*
- * Make sure we use a signed variant of min() since
- * the second term may be negative.
+ * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
+ * chopped for the EOF condition, we cannot tell how large
+ * NFS directories are going to be until we hit EOF. So
+ * an NFS directory buffer is *not* chopped to its EOF. Now,
+ * it just so happens that b_resid will effectively chop it
+ * to EOF. *BUT* this information is lost if the buffer goes
+ * away and is reconstituted into a B_CACHE state ( due to
+ * being VMIO ) later. So we keep track of the directory eof
+ * in np->n_direofoffset and chop it off as an extra step
+ * right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
+ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
+ n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@@ -649,6 +668,10 @@ again:
n = 0;
break;
case VDIR:
+ /*
+ * Invalidate buffer if caching is disabled, forcing a
+ * re-read from the remote later.
+ */
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@@ -660,24 +683,6 @@ again:
return (error);
}
-static void
-nfs_prot_buf(bp, off, n)
- struct buf *bp;
- int off;
- int n;
-{
- int pindex, boff, end;
-
- if ((bp->b_flags & B_VMIO) == 0)
- return;
-
- end = round_page(off + n);
- for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
- pindex = boff >> PAGE_SHIFT;
- vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
- }
-}
-
/*
* Vnode op for write using bio
*/
@@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
- register int biosize;
- register struct uio *uio = ap->a_uio;
+ int biosize;
+ struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
- register struct ucred *cred = ap->a_cred;
+ struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
- int bufsize;
+ int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
- /*
- * I use nm_rsize, not nm_wsize so that all buffer cache blocks
- * will be the same size within a filesystem. nfs_writerpc will
- * still use nm_wsize when sizing the rpc's.
- */
+
biosize = vp->v_mount->mnt_stat.f_iosize;
+
do {
/*
* Check for a valid write lease.
@@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
- if (uio->uio_offset + n > np->n_size) {
+ /*
+ * Handle direct append and file extension cases, calculate
+ * unaligned buffer size.
+ */
+
+ if (uio->uio_offset == np->n_size && n) {
+ /*
+ * special append case. Obtain buffer prior to
+ * resizing it to maintain B_CACHE.
+ */
+ long save;
+
+ bcount = on;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ save = bp->b_flags & B_CACHE;
+
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
+
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ } else {
+ if (uio->uio_offset + n > np->n_size) {
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
+ }
+ bcount = biosize;
+ if ((off_t)(lbn + 1) * biosize > np->n_size)
+ bcount = np->n_size - (off_t)lbn * biosize;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ }
+
+ /*
+ * Issue a READ if B_CACHE is not set. In special-append
+ * mode, B_CACHE is based on the buffer prior to the write
+ * op and is typically set, avoiding the read. If a read
+ * is required in special append mode, the server will
+ * probably send us a short-read since we extended the file
+ * on our end, resulting in b_resid == 0 and, thusly,
+ * B_CACHE getting set.
+ *
+ * We can also avoid issuing the read if the write covers
+ * the entire buffer. We have to make sure the buffer state
+ * is reasonable in this case since we will not be initiating
+ * I/O. See the comments in kern/vfs_bio.c's getblk() for
+ * more information.
+ *
+ * B_CACHE may also be set due to the buffer being cached
+ * normally.
+ */
+
+ if (on == 0 && n == bcount) {
+ bp->b_flags |= B_CACHE;
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
}
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ if (error) {
+ brelse(bp);
+ return (error);
+ }
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
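The read-avoidance rule described in the comment above reduces to a small decision: a write that covers the whole buffer marks it B_CACHE outright, and only a buffer still lacking B_CACHE needs a read. A condensed userland sketch, with mock flag values and the nfs_doio() call stubbed out:

#include <stdio.h>

#define B_CACHE 0x1
#define B_READ  0x2

static int
prepare_write_buf(int *flags, int on, int n, int bcount)
{
        /*
         * A write covering the whole buffer needs no read-before-write:
         * every byte will be overwritten, so mark the buffer fully valid.
         */
        if (on == 0 && n == bcount)
                *flags |= B_CACHE;

        /* Otherwise a partially valid buffer must be filled by a READ. */
        if ((*flags & B_CACHE) == 0) {
                *flags |= B_READ;
                return (1);     /* the caller would run nfs_doio() here */
        }
        return (0);
}

int
main(void)
{
        int flags = 0;

        printf("%d\n", prepare_write_buf(&flags, 0, 8192, 8192));  /* 0 */
        flags = 0;
        printf("%d\n", prepare_write_buf(&flags, 100, 200, 8192)); /* 1 */
        return (0);
}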
@@ -820,6 +879,17 @@ again:
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
+ *
+ * While it is possible to merge discontiguous writes due to
+ * our having a B_CACHE buffer ( and thus valid read data
+ * for the hole), we don't because it could lead to
+ * significant cache coherency problems with multiple clients,
+ * especially if locking is implemented later on.
+ *
+ * As an optimization we could theoretically maintain
+ * a linked list of discontiguous areas, but we would still
+ * have to commit them separately so there isn't much
+ * advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
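The contiguity rule the comment describes can be shown in isolation. This sketch keeps a single dirty range and reports when a new write at [on, on+n) would be discontiguous with it; struct dirty is a toy stand-in for the b_dirtyoff/b_dirtyend pair, and the flush-and-retry the kernel performs is left out:

#include <stdio.h>

struct dirty { int off, end; };

/* returns 1 if the old dirty range must be written out first */
static int
merge_dirty(struct dirty *d, int on, int n)
{
        if (d->end > 0 && (on > d->end || on + n < d->off))
                return (1);             /* discontiguous: flush old range */
        if (d->end == 0) {
                d->off = on;            /* no dirty range yet: start one */
                d->end = on + n;
        } else {
                if (on < d->off)        /* contiguous/overlapping: extend */
                        d->off = on;
                if (on + n > d->end)
                        d->end = on + n;
        }
        return (0);
}

int
main(void)
{
        struct dirty d = { 0, 0 };

        merge_dirty(&d, 100, 50);                 /* starts [100,150) */
        printf("%d\n", merge_dirty(&d, 150, 10)); /* 0: grows to [100,160) */
        printf("%d\n", merge_dirty(&d, 500, 10)); /* 1: gap, must flush */
        return (0);
}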
@@ -863,11 +933,6 @@ again:
}
/*
- * This will keep the buffer and mmaped regions more coherent.
- */
- nfs_prot_buf(bp, on, n);
-
- /*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
*/
@@ -879,21 +944,7 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
- }
-
- /*
- * To avoid code complexity, we may have to throw away
- * previously valid ranges when merging the new dirty range
- * into the valid range. As long as we do not *ADD* an
- * invalid valid range, we are ok.
- */
- if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
- bp->b_validoff > bp->b_dirtyend) {
- bp->b_validoff = bp->b_dirtyoff;
- bp->b_validend = bp->b_dirtyend;
- } else {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
+ vfs_bio_set_validclean(bp, on, n);
}
/*
@@ -904,11 +955,14 @@ again:
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
+ *
+ * IO_INVAL appears to be unused. The intent seems to be
+ * to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
- bp->b_flags |= B_INVAL;
+ bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@@ -922,8 +976,9 @@ again:
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
- } else
+ } else {
bdwrite(bp);
+ }
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
@@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
- } else
+ } else {
bp = getblk(vp, bn, size, 0, 0);
+ }
if (vp->v_type == VREG) {
int biosize;
+
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
-
return (bp);
}
@@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
+ *
+ * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
+ * is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
- int error = 0, diff, len, iomode, must_commit = 0;
+ int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ /*
+ * Clear B_ERROR and B_INVAL state prior to initiating the I/O. We
+ * do this here so we do not have to do it in all the code that
+ * calls us.
+ */
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
+
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
- bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
- * If len > 0, there is a hole in the file and
- * no writes after the hole have been pushed to
- * the server yet.
- * Just zero fill the rest of the valid area.
+ * If we had a short read with no error, we must have
+ * hit a file hole. We should zero-fill the remainder.
+ * This can also occur if the server hits the file EOF.
+ *
+ * Holes used to be able to occur due to pending
+ * writes, but that is not possible any longer.
*/
- diff = bp->b_bcount - uiop->uio_resid;
- len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
- + diff);
- if (len > 0) {
- len = min(len, uiop->uio_resid);
- bzero((char *)bp->b_data + diff, len);
- bp->b_validend = diff + len;
- } else
- bp->b_validend = diff;
- } else
- bp->b_validend = bp->b_bcount;
+ int nread = bp->b_bcount - uiop->uio_resid;
+ int left = bp->b_bcount - nread;
+
+ if (left > 0)
+ bzero((char *)bp->b_data + nread, left);
+ uiop->uio_resid = 0;
+ }
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
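The short-read handling above amounts to zero-filling the tail of the buffer. A self-contained version with bzero() swapped for memset() and the buffer mocked:

#include <stdio.h>
#include <string.h>

static void
finish_read(char *data, int bcount, int resid)
{
        int nread = bcount - resid;   /* bytes the server actually returned */
        int left = bcount - nread;    /* tail the hole/EOF left unfilled */

        if (left > 0)
                memset(data + nread, 0, left);
}

int
main(void)
{
        char buf[16];

        memset(buf, 0xff, sizeof(buf));
        finish_read(buf, sizeof(buf), 6);       /* 10 bytes read, 6 short */
        printf("%d\n", buf[15]);                /* prints 0 */
        return (0);
}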
@@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
+ /*
+ * End-of-directory sets B_INVAL but does not generate an
+ * error.
+ */
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
- && bp->b_dirtyend == bp->b_bufsize)
+ && bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
diff --git a/sys/nfs/nfs_nqlease.c b/sys/nfs/nfs_nqlease.c
index 71f692a..e45c73f 100644
--- a/sys/nfs/nfs_nqlease.c
+++ b/sys/nfs/nfs_nqlease.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_nqlease.c 8.9 (Berkeley) 5/20/95
- * $Id: nfs_nqlease.c,v 1.39 1998/10/31 15:31:25 peter Exp $
+ * $Id: nfs_nqlease.c,v 1.40 1999/02/25 00:03:51 peter Exp $
*/
@@ -561,6 +561,10 @@ nqsrv_send_eviction(vp, lp, slp, nam, cred)
*mtod(m, u_int32_t *) = htonl(0x80000000 |
(m->m_pkthdr.len - NFSX_UNSIGNED));
}
+ /*
+ * nfs_sndlock if PR_CONNREQUIRED XXX
+ */
+
if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 &&
(lph->lph_slp->ns_flag & SLP_VALID) == 0) ||
(nfs_slplock(lph->lph_slp, 0) == 0))
diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c
index 1490f72..2267629 100644
--- a/sys/nfs/nfs_socket.c
+++ b/sys/nfs/nfs_socket.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to occur only occasionally
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
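For reference, the new strategy can be demonstrated outside the kernel. The sketch below uses a toy segment chain in place of mbufs and a single malloc'd buffer in place of the MGET/MCLGET plus m_copyback sequence; the alignment test is the same (len & 0x3) || (ptr & 0x3) check used above. Allocation error handling is omitted:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct seg {
        struct seg *next;
        char *data;
        int len;
};

static void
realign(struct seg **pm)
{
        struct seg *m, *s, *t, *n;
        int off, total = 0;

        /* walk to the first misaligned segment, as nfs_realign() does */
        while ((m = *pm) != NULL) {
                if ((m->len & 0x3) || ((uintptr_t)m->data & 0x3))
                        break;
                pm = &m->next;
        }
        if (m == NULL)
                return;

        for (s = m; s != NULL; s = s->next)
                total += s->len;

        /* one aligned buffer replaces the rest of the chain */
        n = malloc(sizeof(*n));
        n->next = NULL;
        n->len = total;
        n->data = malloc(total ? total : 1);    /* malloc memory is aligned */

        for (s = m, off = 0; s != NULL; s = t) {
                t = s->next;
                memcpy(n->data + off, s->data, s->len);
                off += s->len;
                free(s->data);
                free(s);
        }
        *pm = n;        /* splice: everything from the bad segment on */
}

int
main(void)
{
        struct seg *s0 = malloc(sizeof(*s0));
        struct seg *s1 = malloc(sizeof(*s1));
        struct seg *head = s0;

        s0->next = s1; s0->len = 8; s0->data = malloc(8);
        s1->next = NULL; s1->len = 5; s1->data = malloc(5); /* odd length */
        memset(s0->data, 1, 8);
        memset(s1->data, 2, 5);

        realign(&head);
        printf("%d\n", head->next->len);        /* prints 5: s1 recopied */

        free(head->next->data); free(head->next);
        free(head->data); free(head);
        return (0);
}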
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index a92bb22..6114d56 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
- bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
+ bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
- aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
+ aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
- return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
- return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
- error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
+ error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
- register struct dirent *dp;
+ register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
-#ifndef nolint
- dp = (struct dirent *)0;
-#endif
#ifndef DIAGNOSTIC
- if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
- (uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
+ if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
+ (uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
- * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
+ * Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
-
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.
+ * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
+ * B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
- if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_NOCACHE;
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return(0);
+ }
+
+ bp->b_flags |= B_CACHE;
/*
- * XXX we bundirty() the bp here. Shouldn't we do it later after
- * the I/O has completed??
+ * Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
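The behavioral change at the top of nfs_writebp() is small but worth isolating: a buffer already marked B_INVAL is now released rather than written out with B_NOCACHE. A mock-flag sketch of the new entry logic, with brelse() stubbed:

#include <stdio.h>

#define B_INVAL 0x1
#define B_CACHE 0x2

static int released;

static void brelse_stub(int *flags) { (void)flags; released = 1; }

static int
writebp(int *flags)
{
        if (*flags & B_INVAL) {         /* stale data: do not write it */
                brelse_stub(flags);
                return (0);
        }
        *flags |= B_CACHE;              /* a written buffer is fully valid */
        /* ... undirty the buffer and issue the write here ... */
        return (0);
}

int
main(void)
{
        int flags = B_INVAL;

        writebp(&flags);
        printf("%d\n", released);       /* prints 1 */
        return (0);
}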
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index cef982b..0d8a782 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
+ * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
-static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
- int i, error, nextoff, size, toff, npages, count;
+ int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ npages = btoc(count);
+
+ /*
+ * If the requested page is partially valid, just return it and
+ * allow the pager to zero-out the blanks. Partially valid pages
+ * can only occur at the file EOF.
+ */
+
+ {
+ vm_page_t m = pages[ap->a_reqpage];
+
+ if (m->valid != 0) {
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
+ return(0);
+ }
+ }
+
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
- npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
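The fast path added above never touches the network: if the requested page has any valid bits, it is returned as-is and the speculative read-ahead pages are freed. A toy version with integer valid-bit masks standing in for vm_page_t and a stub for vnode_pager_freepage():

#include <stdio.h>

#define NPAGES 4

static int freed[NPAGES];

static void freepage_stub(int i) { freed[i] = 1; }

/* returns 1 if the read was skipped */
static int
getpages_fastpath(int valid[], int npages, int reqpage)
{
        int i;

        if (valid[reqpage] != 0) {
                for (i = 0; i < npages; ++i)
                        if (i != reqpage)
                                freepage_stub(i);
                return (1);
        }
        return (0);
}

int
main(void)
{
        int valid[NPAGES] = { 0, 0x0f, 0, 0 }; /* page 1 partially valid */

        printf("%d\n", getpages_fastpath(valid, NPAGES, 1)); /* prints 1 */
        printf("%d\n", freed[0]);                            /* prints 1 */
        return (0);
}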
@@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
- * Read operation filled a partial page, set valid
- * bits properly. validclean will zero out
- * any cruft in the buffer when setting a valid bit,
- * if the size is not DEV_BSIZE aligned.
+ * Read operation filled a partial page.
*/
+ m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
- } else {
- /*
- * This page is being mapped, clear out any other
- * cruft in the invalid areas of the page.
- */
- if (m->valid && m->valid != VM_PAGE_BITS_ALL)
- vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
+ off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
+ struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
+ np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
+ offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@@ -253,6 +271,16 @@ nfs_putpages(ap)
}
/*
+ * When putting pages, do not extend file past EOF.
+ */
+
+ if (offset + count > np->n_size) {
+ count = np->n_size - offset;
+ if (count < 0)
+ count = 0;
+ }
+
+ /*
* We use only the kva address for the buffer, but this is extremely
* convenient and fast.
*/
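The EOF clamp above, extracted with example sizes; note that count can go negative before the clamp when every page in the run lies past EOF, which is why the second test is needed:

#include <stdio.h>
#include <sys/types.h>

static int
clamp_count(off_t offset, int count, off_t n_size)
{
        if (offset + count > n_size) {
                count = n_size - offset;
                if (count < 0)
                        count = 0;
        }
        return (count);
}

int
main(void)
{
        printf("%d\n", clamp_count(8192, 8192, 12000));  /* prints 3808 */
        printf("%d\n", clamp_count(16384, 4096, 12000)); /* prints 0 */
        return (0);
}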
@@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
+ uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
-nfs_bioread(vp, uio, ioflag, cred, getpages)
+nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
- int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
- off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
- int bufsize;
- int nra, error = 0, n = 0, on = 0, not_readin;
+ int bcount;
+ int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- not_readin = 1;
/*
* Start the read ahead(s), as required.
@@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
- * If the block is in the cache and has the required data
- * in a valid region, just copy it out.
- * Otherwise, get the block and write back/read in,
- * as required.
+ * Obtain the buffer cache block. Figure out the buffer size
+ * when we are at EOF. nfs_getcacheblk() will also force
+ * uncached delayed-writes to be flushed to the server.
+ *
+ * Note that bcount is *not* DEV_BSIZE aligned.
*/
-again:
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size &&
- (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ bcount = biosize;
+ if ((off_t)lbn * biosize >= np->n_size) {
+ bcount = 0;
+ } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
+ bcount = np->n_size - (off_t)lbn * biosize;
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
- * If we are being called from nfs_getpages, we must
- * make sure the buffer is a vmio buffer. The vp will
- * already be setup for vmio but there may be some old
- * non-vmio buffers attached to it.
+ * If B_CACHE is not set, we must issue the read. If this
+ * fails, we return an error.
*/
- if (getpages && !(bp->b_flags & B_VMIO)) {
-#ifdef DIAGNOSTIC
- printf("nfs_bioread: non vmio buf found, discarding\n");
-#endif
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
+
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
- not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -501,32 +509,20 @@ again:
return (error);
}
}
- if (bufsize > on) {
- n = min((unsigned)(bufsize - on), uio->uio_resid);
- } else {
- n = 0;
- }
- diff = np->n_size - uio->uio_offset;
- if (diff < n)
- n = diff;
- if (not_readin && n > 0) {
- if (on < bp->b_validoff || (on + n) > bp->b_validend) {
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
- }
+
+ /*
+ * on is the offset into the current bp. Figure out how many
+ * bytes we can copy out of the bp. Note that bcount is
+ * NOT DEV_BSIZE aligned.
+ *
+ * Then figure out how many bytes we can copy into the uio.
+ */
+
+ n = 0;
+ if (on < bcount)
+ n = min((unsigned)(bcount - on), uio->uio_resid);
+
vp->v_lastr = lbn;
- diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
- if (diff < n)
- n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
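Putting the two computations together, the EOF-chopped bcount from the earlier hunk and the copy length n above, with example numbers; biosize and the file size are made up, and biosize is assumed to be a power of two as f_iosize is in practice:

#include <stdio.h>
#include <sys/types.h>

static int
read_copy_len(off_t n_size, int biosize, off_t offset, int resid)
{
        off_t lbn = offset / biosize;
        int on = offset & (biosize - 1);
        int bcount = biosize;
        int n = 0;

        if ((off_t)lbn * biosize >= n_size)
                bcount = 0;             /* block is entirely past EOF */
        else if ((off_t)(lbn + 1) * biosize > n_size)
                bcount = n_size - (off_t)lbn * biosize;

        if (on < bcount) {
                n = bcount - on;
                if (resid < n)
                        n = resid;
        }
        return (n);
}

int
main(void)
{
        /* 20000-byte file, 8192-byte blocks, read at offset 17000 */
        printf("%d\n", read_copy_len(20000, 8192, 17000, 65536)); /* 3000 */
        return (0);
}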
@@ -535,7 +531,6 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,13 +555,13 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
+ printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@@ -574,6 +569,10 @@ again:
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
+ *
+ * Leave the last bp intact unless there is an error.
+ * Loop back up to the while if the error is another
+ * NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@@ -582,21 +581,32 @@ again:
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
- if ((bp->b_flags & B_DONE) == 0) {
- bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
- vfs_busy_pages(bp, 0);
- error = nfs_doio(bp, cred, p);
- if (error == 0 && (bp->b_flags & B_INVAL))
- break;
- if (error) {
- brelse(bp);
- } else if (i < lbn) {
- brelse(bp);
- }
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ /*
+ * no error + B_INVAL == directory EOF,
+ * use the block.
+ */
+ if (error == 0 && (bp->b_flags & B_INVAL))
+ break;
}
+ /*
+ * An error will throw away the block and the
+ * for loop will break out. If no error and this
+ * is not the block we want, we throw away the
+ * block and go for the next one via the for loop.
+ */
+ if (error || i < lbn)
+ brelse(bp);
}
}
+ /*
+ * The above while is repeated if we hit another cookie
+ * error. If we hit an error and it wasn't a cookie error,
+ * we give up.
+ */
if (error)
return (error);
}
@@ -616,7 +626,6 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -629,10 +638,20 @@ again:
}
}
/*
- * Make sure we use a signed variant of min() since
- * the second term may be negative.
+ * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
+ * chopped for the EOF condition, we cannot tell how large
+ * NFS directories are going to be until we hit EOF. So
+ * an NFS directory buffer is *not* chopped to its EOF. Now,
+ * it just so happens that b_resid will effectively chop it
+ * to EOF. *BUT* this information is lost if the buffer goes
+ * away and is reconstituted into a B_CACHE state ( due to
+ * being VMIO ) later. So we keep track of the directory eof
+ * in np->n_direofoffset and chop it off as an extra step
+ * right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
+ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
+ n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@@ -649,6 +668,10 @@ again:
n = 0;
break;
case VDIR:
+ /*
+ * Invalidate buffer if caching is disabled, forcing a
+ * re-read from the remote later.
+ */
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@@ -660,24 +683,6 @@ again:
return (error);
}
-static void
-nfs_prot_buf(bp, off, n)
- struct buf *bp;
- int off;
- int n;
-{
- int pindex, boff, end;
-
- if ((bp->b_flags & B_VMIO) == 0)
- return;
-
- end = round_page(off + n);
- for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
- pindex = boff >> PAGE_SHIFT;
- vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
- }
-}
-
/*
* Vnode op for write using bio
*/
@@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
- register int biosize;
- register struct uio *uio = ap->a_uio;
+ int biosize;
+ struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
- register struct ucred *cred = ap->a_cred;
+ struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
- int bufsize;
+ int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
- /*
- * I use nm_rsize, not nm_wsize so that all buffer cache blocks
- * will be the same size within a filesystem. nfs_writerpc will
- * still use nm_wsize when sizing the rpc's.
- */
+
biosize = vp->v_mount->mnt_stat.f_iosize;
+
do {
/*
* Check for a valid write lease.
@@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
- if (uio->uio_offset + n > np->n_size) {
+ /*
+ * Handle direct append and file extension cases, calculate
+ * unaligned buffer size.
+ */
+
+ if (uio->uio_offset == np->n_size && n) {
+ /*
+ * Special append case. Obtain the buffer prior to
+ * resizing it to maintain B_CACHE.
+ */
+ long save;
+
+ bcount = on;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ save = bp->b_flags & B_CACHE;
+
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
+
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ } else {
+ if (uio->uio_offset + n > np->n_size) {
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
+ }
+ bcount = biosize;
+ if ((off_t)(lbn + 1) * biosize > np->n_size)
+ bcount = np->n_size - (off_t)lbn * biosize;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ }
+
+ /*
+ * Issue a READ if B_CACHE is not set. In special-append
+ * mode, B_CACHE is based on the buffer prior to the write
+ * op and is typically set, avoiding the read. If a read
+ * is required in special append mode, the server will
+ * probably send us a short-read since we extended the file
+ * on our end, resulting in b_resid == 0 and, thus,
+ * B_CACHE getting set.
+ *
+ * We can also avoid issuing the read if the write covers
+ * the entire buffer. We have to make sure the buffer state
+ * is reasonable in this case since we will not be initiating
+ * I/O. See the comments in kern/vfs_bio.c's getblk() for
+ * more information.
+ *
+ * B_CACHE may also be set due to the buffer being cached
+ * normally.
+ */
+
+ if (on == 0 && n == bcount) {
+ bp->b_flags |= B_CACHE;
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
}
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ if (error) {
+ brelse(bp);
+ return (error);
+ }
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
@@ -820,6 +879,17 @@ again:
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
+ *
+ * While it is possible to merge discontiguous writes due to
+ * our having a B_CACHE buffer ( and thus valid read data
+ * for the hole), we don't because it could lead to
+ * significant cache coherency problems with multiple clients,
+ * especially if locking is implemented later on.
+ *
+ * As an optimization we could theoretically maintain
+ * a linked list of discontiguous areas, but we would still
+ * have to commit them separately so there isn't much
+ * advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
@@ -863,11 +933,6 @@ again:
}
/*
- * This will keep the buffer and mmaped regions more coherent.
- */
- nfs_prot_buf(bp, on, n);
-
- /*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
*/
@@ -879,21 +944,7 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
- }
-
- /*
- * To avoid code complexity, we may have to throw away
- * previously valid ranges when merging the new dirty range
- * into the valid range. As long as we do not *ADD* an
- * invalid valid range, we are ok.
- */
- if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
- bp->b_validoff > bp->b_dirtyend) {
- bp->b_validoff = bp->b_dirtyoff;
- bp->b_validend = bp->b_dirtyend;
- } else {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
+ vfs_bio_set_validclean(bp, on, n);
}
/*
@@ -904,11 +955,14 @@ again:
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
+ *
+ * IO_INVAL appears to be unused. The intent seems to be
+ * to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
- bp->b_flags |= B_INVAL;
+ bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@@ -922,8 +976,9 @@ again:
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
- } else
+ } else {
bdwrite(bp);
+ }
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
@@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
- } else
+ } else {
bp = getblk(vp, bn, size, 0, 0);
+ }
if (vp->v_type == VREG) {
int biosize;
+
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
-
return (bp);
}
@@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
+ *
+ * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
+ * is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
- int error = 0, diff, len, iomode, must_commit = 0;
+ int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ /*
+ * Clear B_ERROR and B_INVAL state prior to initiating the I/O. We
+ * do this here so we do not have to do it in all the code that
+ * calls us.
+ */
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
+
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
- bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
- * If len > 0, there is a hole in the file and
- * no writes after the hole have been pushed to
- * the server yet.
- * Just zero fill the rest of the valid area.
+ * If we had a short read with no error, we must have
+ * hit a file hole. We should zero-fill the remainder.
+ * This can also occur if the server hits the file EOF.
+ *
+ * Holes used to be able to occur due to pending
+ * writes, but that is not possible any longer.
*/
- diff = bp->b_bcount - uiop->uio_resid;
- len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
- + diff);
- if (len > 0) {
- len = min(len, uiop->uio_resid);
- bzero((char *)bp->b_data + diff, len);
- bp->b_validend = diff + len;
- } else
- bp->b_validend = diff;
- } else
- bp->b_validend = bp->b_bcount;
+ int nread = bp->b_bcount - uiop->uio_resid;
+ int left = bp->b_bcount - nread;
+
+ if (left > 0)
+ bzero((char *)bp->b_data + nread, left);
+ uiop->uio_resid = 0;
+ }
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
@@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
+ /*
+ * End-of-directory sets B_INVAL but does not generate an
+ * error.
+ */
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
- && bp->b_dirtyend == bp->b_bufsize)
+ && bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c
index 1490f72..2267629 100644
--- a/sys/nfsclient/nfs_socket.c
+++ b/sys/nfsclient/nfs_socket.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to occur only occasionally
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index a92bb22..6114d56 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
- bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
+ bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
- aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
+ aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
- return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
- return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
- error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
+ error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
- register struct dirent *dp;
+ register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
-#ifndef nolint
- dp = (struct dirent *)0;
-#endif
#ifndef DIAGNOSTIC
- if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
- (uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
+ if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
+ (uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
- * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
+ * Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
-
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.
+ * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
+ * B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
- if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_NOCACHE;
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return(0);
+ }
+
+ bp->b_flags |= B_CACHE;
/*
- * XXX we bundirty() the bp here. Shouldn't we do it later after
- * the I/O has completed??
+ * Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
diff --git a/sys/nfsclient/nfsargs.h b/sys/nfsclient/nfsargs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsclient/nfsargs.h
+++ b/sys/nfsclient/nfsargs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsclient/nfsstats.h b/sys/nfsclient/nfsstats.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsclient/nfsstats.h
+++ b/sys/nfsclient/nfsstats.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsserver/nfs.h b/sys/nfsserver/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsserver/nfs.h
+++ b/sys/nfsserver/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c
index 1490f72..2267629 100644
--- a/sys/nfsserver/nfs_srvsock.c
+++ b/sys/nfsserver/nfs_srvsock.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to occur only occasionally
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfsserver/nfsrvstats.h b/sys/nfsserver/nfsrvstats.h
index bc15a7c..78a54a2 100644
--- a/sys/nfsserver/nfsrvstats.h
+++ b/sys/nfsserver/nfsrvstats.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index d2ce212..2e88ca7 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
+ * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
+ *
+ * NOTES:
+ * b_bufsize, b_bcount. b_bufsize is the allocation size of the
+ * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
+ * originally requested buffer size and can serve as a bounds check
+ * against EOF. For most, but not all uses, b_bcount == b_bufsize.
+ *
+ * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
+ * ranges of dirty data that need to be written to backing store.
+ * The range is typically clipped at b_bcount ( not b_bufsize ).
+ *
+ * b_resid. Number of bytes remaining in I/O. After an I/O operation
+ * completes, b_resid is usually 0 indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
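A one-line illustration of the b_bcount / b_bufsize relationship described in the note above, assuming the usual DEV_BSIZE of 512; an EOF-chopped request of 3616 bytes rounds up to a 4096-byte allocation:

#include <stdio.h>

#define DEV_BSIZE 512   /* the usual value; shown here only for the math */

int
main(void)
{
        int b_bcount = 3616;            /* EOF-chopped request */
        int b_bufsize = (b_bcount + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

        printf("%d %d\n", b_bcount, b_bufsize); /* prints 3616 4096 */
        return (0);
}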
@@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
+#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
+#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
- * The situation where B_DELWRI is set and B_CACHE gets
- * cleared MUST be committed to disk so B_DELWRI can
- * also be cleared.
+ * The situation where B_DELWRI is set and B_CACHE is
+ * clear MUST be committed to disk by getblk() so
+ * B_DELWRI can also be cleared. See the comments for
+ * getblk() in kern/vfs_bio.c. If B_CACHE is clear,
+ * the caller is expected to clear B_ERROR|B_INVAL,
+ * set B_READ, and initiate an I/O.
+ *
+ * The 'entire buffer' is defined to be the range from
+ * 0 through b_bcount.
+ *
+ * B_MALLOC Request that the buffer be allocated from the malloc
+ * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
+ *
+ * B_VMIO Indicates that the buffer is tied into a VM object.
+ * The buffer's data is always PAGE_SIZE aligned even
+ * if b_bufsize and b_bcount are not. ( b_bufsize is
+ * always at least DEV_BSIZE aligned, though ).
+ *
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
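The caller protocol spelled out for B_CACHE above is the pattern this change installs throughout the tree (see nfs_bioread() and nfs_write() earlier in this diff). A stubbed sketch of that sequence, with mock flag values and the actual I/O replaced by a counter:

#include <stdio.h>

#define B_CACHE 0x01
#define B_READ  0x02
#define B_ERROR 0x04
#define B_INVAL 0x08

static int io_issued;

static int doio_stub(int *flags) { (void)flags; io_issued = 1; return (0); }

static int
get_valid_buffer(int *flags)
{
        if ((*flags & B_CACHE) == 0) {
                *flags &= ~(B_ERROR | B_INVAL); /* clear stale state */
                *flags |= B_READ;               /* then initiate the read */
                return (doio_stub(flags));
        }
        return (0);             /* cache hit: data is already valid */
}

int
main(void)
{
        int flags = 0;

        get_valid_buffer(&flags);
        printf("%d\n", io_issued);      /* prints 1 */
        return (0);
}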
@@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
+void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
+
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index d2ce212..2e88ca7 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
+ * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
+ *
+ * NOTES:
+ * b_bufsize, b_bcount. b_bufsize is the allocation size of the
+ * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
+ * originally requested buffer size and can serve as a bounds check
+ * against EOF. For most, but not all uses, b_bcount == b_bufsize.
+ *
+ * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
+ * ranges of dirty data that need to be written to backing store.
+ * The range is typically clipped at b_bcount ( not b_bufsize ).
+ *
+ * b_resid. Number of bytes remaining in I/O. After an I/O operation
+ * completes, b_resid is usually 0, indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
@@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
+#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
+#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
- * The situation where B_DELWRI is set and B_CACHE gets
- * cleared MUST be committed to disk so B_DELWRI can
- * also be cleared.
+ * The situation where B_DELWRI is set and B_CACHE is
+ * clear MUST be committed to disk by getblk() so
+ * B_DELWRI can also be cleared. See the comments for
+ * getblk() in kern/vfs_bio.c. If B_CACHE is clear,
+ * the caller is expected to clear B_ERROR|B_INVAL,
+ * set B_READ, and initiate an I/O.
+ *
+ * The 'entire buffer' is defined to be the range from
+ * 0 through b_bcount.
+ *
+ * B_MALLOC Request that the buffer be allocated from the malloc
+ * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
+ *
+ * B_VMIO Indicates that the buffer is tied into a VM object.
+ * The buffer's data is always PAGE_SIZE aligned even
+ * if b_bufsize and b_bcount are not. ( b_bufsize is
+ * always at least DEV_BSIZE aligned, though ).
+ *
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
@@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
+void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
+
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
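The b_dirtyoff/b_dirtyend note above says the dirty range is clipped at b_bcount rather than b_bufsize. A sketch of how a writer might grow the dirty range under that rule; buf_dirty_range() is a hypothetical helper, and treating b_dirtyend == 0 as "no dirty region yet" is an assumption of the sketch:

static void
buf_dirty_range(struct buf *bp, int off, int len)
{
        int end = off + len;

        /* Clip at the originally requested size, not the allocation. */
        if (end > bp->b_bcount)
                end = bp->b_bcount;
        if (bp->b_dirtyend == 0) {
                /* no dirty region yet */
                bp->b_dirtyoff = off;
                bp->b_dirtyend = end;
        } else {
                /* extend the existing dirty region to cover the range */
                if (off < bp->b_dirtyoff)
                        bp->b_dirtyoff = off;
                if (end > bp->b_dirtyend)
                        bp->b_dirtyend = end;
        }
}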
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 8821440..c80d0a5 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
- * $Id: ffs_inode.c,v 1.52 1999/01/07 16:14:16 bde Exp $
+ * $Id: ffs_inode.c,v 1.53 1999/01/28 00:57:54 dillon Exp $
*/
#include "opt_quota.h"
@@ -452,6 +452,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
if ((bp->b_flags & B_CACHE) == 0) {
curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_ERROR|B_INVAL);
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c
index d4d82f0..c9ae4dd 100644
--- a/sys/ufs/mfs/mfs_vnops.c
+++ b/sys/ufs/mfs/mfs_vnops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
- * $Id: mfs_vnops.c,v 1.42 1999/01/28 00:57:55 dillon Exp $
+ * $Id: mfs_vnops.c,v 1.43 1999/04/11 02:28:32 eivind Exp $
*/
#include <sys/param.h>
@@ -127,6 +127,9 @@ mfs_fsync(ap)
* We implement the B_FREEBUF strategy. We can't just madvise()
* here because we have to do it in the correct order vs other bio
* requests, so we queue it.
+ *
+ * Note: geteblk() sets B_INVAL. We leave it set to guarantee buffer
+ * throw-away on brelse()? XXX
*/
static int
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 047f10f..882d52e 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -66,7 +66,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_fault.c,v 1.100 1999/02/17 09:08:29 dillon Exp $
+ * $Id: vm_fault.c,v 1.101 1999/02/25 06:00:52 alc Exp $
*/
/*
@@ -409,6 +409,12 @@ readrest:
firstpindex = fs.first_pindex -
2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1);
+ /*
+ * note: partially valid pages cannot be
+ * included in the lookahead - NFS piecemeal
+ * writes will barf on it badly.
+ */
+
for(tmppindex = fs.first_pindex - 1;
tmppindex >= firstpindex;
--tmppindex) {
@@ -552,12 +558,16 @@ readrest:
}
fs.first_m = NULL;
+ /*
+ * Zero the page if necessary and mark it valid.
+ */
if ((fs.m->flags & PG_ZERO) == 0) {
vm_page_zero_fill(fs.m);
- }
- else
+ } else {
cnt.v_ozfod++;
+ }
cnt.v_zfod++;
+ fs.m->valid = VM_PAGE_BITS_ALL;
break; /* break to PAGE HAS BEEN FOUND */
} else {
if (fs.object != fs.first_object) {
@@ -788,14 +798,24 @@ readrest:
#endif
unlock_things(&fs);
- fs.m->valid = VM_PAGE_BITS_ALL;
- vm_page_flag_clear(fs.m, PG_ZERO);
+
+ /*
+ * Sanity check: page must be completely valid or it is not fit to
+ * map into user space. vm_pager_get_pages() ensures this.
+ */
+
+ if (fs.m->valid != VM_PAGE_BITS_ALL) {
+ vm_page_zero_invalid(fs.m, TRUE);
+ printf("Warning: page %p partially invalid on fault\n", fs.m);
+ }
pmap_enter(fs.map->pmap, vaddr, VM_PAGE_TO_PHYS(fs.m), prot, wired);
+
if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) {
pmap_prefault(fs.map->pmap, vaddr, fs.entry);
}
+ vm_page_flag_clear(fs.m, PG_ZERO);
vm_page_flag_set(fs.m, PG_MAPPED|PG_REFERENCED);
if (fault_flags & VM_FAULT_HOLD)
vm_page_hold(fs.m);
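The fallback above depends on vm_page_zero_invalid() zeroing any DEV_BSIZE chunk whose valid bit is clear before the page is mapped into user space. A simplified sketch of that behavior under stated assumptions (the caller supplies the page's kernel mapping va; the real routine lives in vm/vm_page.c and the names here are illustrative):

static void
zero_invalid_chunks(vm_page_t m, caddr_t va, boolean_t setvalid)
{
        int i;

        for (i = 0; i < PAGE_SIZE / DEV_BSIZE; ++i) {
                /* zero each DEV_BSIZE chunk that is not marked valid */
                if ((m->valid & (1 << i)) == 0)
                        bzero(va + (i << DEV_BSHIFT), DEV_BSIZE);
        }
        if (setvalid)
                m->valid = VM_PAGE_BITS_ALL;
}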
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index e07ea63..0d85a94 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
- * $Id: vm_page.c,v 1.128 1999/03/19 05:21:03 alc Exp $
+ * $Id: vm_page.c,v 1.129 1999/04/05 19:38:29 julian Exp $
*/
/*
@@ -1460,14 +1460,16 @@ vm_page_bits(int base, int size)
}
/*
- * set a page valid and clean. May not block.
+ * vm_page_set_validclean:
*
- * In order to maintain consistancy due to the DEV_BSIZE granularity
- * of the valid bits, we have to zero non-DEV_BSIZE aligned portions of
- * the page at the beginning and end of the valid range when the
- * associated valid bits are not already set.
+ * Sets portions of a page valid and clean. The arguments are expected
+ * to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
+ * of any partial chunks touched by the range. The invalid portion of
+ * such chunks will be zero'd.
*
- * (base + size) must be less then or equal to PAGE_SIZE.
+ * This routine may not block.
+ *
+ * (base + size) must be less than or equal to PAGE_SIZE.
*/
void
vm_page_set_validclean(m, base, size)
@@ -1529,8 +1531,35 @@ vm_page_set_validclean(m, base, size)
pmap_clear_modify(VM_PAGE_TO_PHYS(m));
}
+#if 0
+
+void
+vm_page_set_dirty(m, base, size)
+ vm_page_t m;
+ int base;
+ int size;
+{
+ m->dirty |= vm_page_bits(base, size);
+}
+
+#endif
+
+void
+vm_page_clear_dirty(m, base, size)
+ vm_page_t m;
+ int base;
+ int size;
+{
+ m->dirty &= ~vm_page_bits(base, size);
+}
+
/*
- * set a page (partially) invalid. May not block.
+ * vm_page_set_invalid:
+ *
+ * Invalidates DEV_BSIZE'd chunks within a page. Both the
+ * valid and dirty bits for the affected areas are cleared.
+ *
+ * May not block.
*/
void
vm_page_set_invalid(m, base, size)
@@ -1540,9 +1569,9 @@ vm_page_set_invalid(m, base, size)
{
int bits;
- m->valid &= ~(bits = vm_page_bits(base, size));
- if (m->valid == 0)
- m->dirty &= ~bits;
+ bits = vm_page_bits(base, size);
+ m->valid &= ~bits;
+ m->dirty &= ~bits;
m->object->generation++;
}
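Both vm_page_set_validclean() and the new vm_page_set_invalid() body above are driven by vm_page_bits(), which turns a byte range into a mask with one bit per DEV_BSIZE chunk. An equivalent computation, sketched for illustration (the real inline appears earlier in this file):

static __inline int
page_bits(int base, int size)
{
        /* index of the first chunk touched, and one past the last */
        int first = base >> DEV_BSHIFT;
        int last = (base + size + DEV_BSIZE - 1) >> DEV_BSHIFT;

        return (((1 << last) - 1) & ~((1 << first) - 1));
}

Because the valid and dirty masks are kept separate, NFS can mark chunks dirty without marking them valid; that is why vm_page_set_invalid() now clears both masks unconditionally instead of clearing dirty only when the whole page became invalid.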
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 8072f66..abff794 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_page.h,v 1.58 1999/03/15 05:09:48 julian Exp $
+ * $Id: vm_page.h,v 1.59 1999/04/05 19:38:29 julian Exp $
*/
/*
@@ -101,6 +101,10 @@
* Fields in this structure are locked either by the lock on the
* object that the page belongs to (O) or by the lock on the page
* queues (P).
+ *
+ * The 'valid' and 'dirty' fields are distinct. A page may have dirty
+ * bits set without having associated valid bits set. This is used by
+ * NFS to implement piecemeal writes.
*/
TAILQ_HEAD(pglist, vm_page);
@@ -404,6 +408,8 @@ void vm_page_wire __P((vm_page_t));
void vm_page_unqueue __P((vm_page_t));
void vm_page_unqueue_nowakeup __P((vm_page_t));
void vm_page_set_validclean __P((vm_page_t, int, int));
+void vm_page_set_dirty __P((vm_page_t, int, int));
+void vm_page_clear_dirty __P((vm_page_t, int, int));
void vm_page_set_invalid __P((vm_page_t, int, int));
static __inline boolean_t vm_page_zero_fill __P((vm_page_t));
int vm_page_is_valid __P((vm_page_t, int, int));
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
index 36a905e..dbacceb 100644
--- a/sys/vm/vm_pager.c
+++ b/sys/vm/vm_pager.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pager.c,v 1.44 1999/03/14 09:20:00 julian Exp $
+ * $Id: vm_pager.c,v 1.45 1999/04/11 02:16:27 eivind Exp $
*/
/*
@@ -523,6 +523,9 @@ vm_pager_chain_iodone(struct buf *nbp)
* Obtain a physical buffer and chain it to its parent buffer. When
* I/O completes, the parent buffer will be B_SIGNAL'd. Errors are
 * automatically propagated to the parent.
+ *
+ * Since these are brand new buffers, we do not have to clear B_INVAL
+ * and B_ERROR because they are already clear.
*/
struct buf *
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
index 82b6574..aff14ab 100644
--- a/sys/vm/vm_pager.h
+++ b/sys/vm/vm_pager.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
- * $Id: vm_pager.h,v 1.20 1999/01/24 02:32:15 dillon Exp $
+ * $Id: vm_pager.h,v 1.21 1999/03/14 09:20:00 julian Exp $
*/
/*
@@ -110,6 +110,14 @@ void flushchainbuf(struct buf *nbp);
void waitchainbuf(struct buf *bp, int count, int done);
void autochaindone(struct buf *bp);
+/*
+ * vm_pager_get_pages:
+ *
+ * Retrieve pages from the VM system in order to map them into an object
+ * ( or into VM space somewhere ). If the pagein was successful, we
+ * must fully validate it.
+ */
+
static __inline int
vm_pager_get_pages(
vm_object_t object,
@@ -117,7 +125,13 @@ vm_pager_get_pages(
int count,
int reqpage
) {
- return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
+ int r;
+
+ r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
+ if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
+ vm_page_zero_invalid(m[reqpage], TRUE);
+ }
+ return(r);
}
static __inline void
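With the wrapper above, a successful pagein implies a fully valid page, so callers (vm_fault in particular) can map the page without their own zero-fill pass. A caller-side sketch; names other than vm_pager_get_pages() and the VM_PAGER_*/VM_PAGE_BITS_ALL constants are illustrative:

        int rv;

        rv = vm_pager_get_pages(object, marray, count, reqpage);
        if (rv == VM_PAGER_OK &&
            marray[reqpage]->valid != VM_PAGE_BITS_ALL)
                panic("pager returned a partially valid page");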
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 628bec7..83f379a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -38,7 +38,7 @@
* SUCH DAMAGE.
*
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
- * $Id: vnode_pager.c,v 1.106 1999/04/05 19:38:29 julian Exp $
+ * $Id: vnode_pager.c,v 1.107 1999/04/10 20:52:11 dt Exp $
*/
/*
@@ -789,7 +789,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
* read.
*/
vm_page_set_validclean(mt, 0, size - tfoff);
- vm_page_zero_invalid(mt, FALSE);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(mt, FALSE); */
}
vm_page_flag_clear(mt, PG_ZERO);