author    alc <alc@FreeBSD.org>  1999-05-02 23:57:16 +0000
committer alc <alc@FreeBSD.org>  1999-05-02 23:57:16 +0000
commit    5cb08a2652f36ddab7172faf6b766038472c1647 (patch)
tree      c47eaa3332628f6c725ca32dda81aa44d24e2ac2 /sys
parent    c75d7e89c3e63bc9b8e9863a5cc985649edf5f9a (diff)
The VFS/BIO subsystem contained a number of hacks in order to optimize
piecemeal, middle-of-file writes for NFS. These hacks have caused no end of
trouble, especially when combined with mmap(). I've removed them. Instead,
NFS will issue a read-before-write to fully instantiate the struct buf
containing the write. NFS does, however, optimize piecemeal appends to
files. For most common file operations, you will not notice the difference.
The sole remaining fragment in the VFS/BIO system is b_dirtyoff/end, which
NFS uses to avoid cache coherency issues with read-merge-write style
operations. NFS also optimizes the write-covers-entire-buffer case by
avoiding the read-before-write. There is quite a bit of room for further
optimization in these areas.

The VM system marks pages fully valid (AKA vm_page_t->valid =
VM_PAGE_BITS_ALL) in several places, most notably in vm_fault. This is not
correct operation. The vm_pager_get_pages() code is now responsible for
marking VM pages all-valid. A number of VM helper routines have been added
to aid in zeroing out the invalid portions of a VM page prior to the page
being marked all-valid. This operation is necessary to properly support
mmap(). The zeroing occurs most often when dealing with file-EOF situations.
Several bugs have been fixed in the NFS subsystem, including bugs in the
handling of file and directory EOF situations, buf->b_flags consistency
issues relating to clearing B_ERROR and B_INVAL, and the handling of B_DONE.

getblk() and allocbuf() have been rewritten. B_CACHE operation is now
formally defined in comments and more straightforward in implementation.
B_CACHE for VMIO buffers is based on the validity of the backing store.
B_CACHE for non-VMIO buffers is based simply on whether the buffer is
B_INVAL or not (B_CACHE set if B_INVAL clear, and vice versa). biodone() is
now responsible for setting B_CACHE when a successful read completes.
B_CACHE is also set when a bdwrite() is initiated and when a bwrite() is
initiated. VFS VOP_BWRITE routines (there are only two: nfs_bwrite() and
bwrite()) are now expected to set B_CACHE. This means that bowrite() and
bawrite() also set B_CACHE indirectly.

There are a number of places in the code which were previously using
buf->b_bufsize (which is DEV_BSIZE aligned) when they should have been
using buf->b_bcount. These have been fixed. getblk() now clears B_DONE on
return because the rest of the system is so bad about dealing with B_DONE.

Major fixes to NFS/TCP have been made. A server-side bug could cause
requests to be lost by the server due to nfs_realign() overwriting other
RPCs in the same TCP mbuf chain. The server's kernel must be recompiled to
get the benefit of the fixes.

Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
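
[Editor's note] To make the new discipline concrete, here is a minimal,
hypothetical sketch of the caller-side B_CACHE protocol this commit
formalizes. It is not part of the patch; it is modeled on the bread() and
ufs_bmaparray() hunks below and assumes the buffer-cache API exactly as it
appears in this diff (the function name is invented):

	/*
	 * Hypothetical illustration only.  getblk() returns the buffer
	 * B_BUSY with B_DONE clear.  If B_CACHE is set, the contents are
	 * fully valid and no I/O is needed.  Otherwise the caller clears
	 * B_ERROR and B_INVAL before initiating the READ, and biodone()
	 * sets B_CACHE on successful completion.
	 */
	static int
	read_block_sketch(struct vnode *vp, daddr_t blkno, int size,
	    struct buf **bpp)
	{
		struct buf *bp;

		bp = getblk(vp, blkno, size, 0, 0);
		*bpp = bp;
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_flags &= ~(B_ERROR | B_INVAL);
			vfs_busy_pages(bp, 0);
			VOP_STRATEGY(vp, bp);
			return (biowait(bp));
		}
		return (0);
	}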
Diffstat (limited to 'sys')
-rw-r--r--sys/gnu/ext2fs/ext2_bmap.c3
-rw-r--r--sys/gnu/fs/ext2fs/ext2_bmap.c3
-rw-r--r--sys/kern/vfs_bio.c830
-rw-r--r--sys/kern/vfs_cluster.c10
-rw-r--r--sys/kern/vfs_default.c12
-rw-r--r--sys/nfs/nfs.h5
-rw-r--r--sys/nfs/nfs_bio.c413
-rw-r--r--sys/nfs/nfs_nqlease.c6
-rw-r--r--sys/nfs/nfs_socket.c136
-rw-r--r--sys/nfs/nfs_vnops.c38
-rw-r--r--sys/nfsclient/nfs.h5
-rw-r--r--sys/nfsclient/nfs_bio.c413
-rw-r--r--sys/nfsclient/nfs_socket.c136
-rw-r--r--sys/nfsclient/nfs_vnops.c38
-rw-r--r--sys/nfsclient/nfsargs.h5
-rw-r--r--sys/nfsclient/nfsstats.h5
-rw-r--r--sys/nfsserver/nfs.h5
-rw-r--r--sys/nfsserver/nfs_srvsock.c136
-rw-r--r--sys/nfsserver/nfsrvstats.h5
-rw-r--r--sys/sys/bio.h40
-rw-r--r--sys/sys/buf.h40
-rw-r--r--sys/ufs/ffs/ffs_inode.c3
-rw-r--r--sys/ufs/mfs/mfs_vnops.c5
-rw-r--r--sys/ufs/ufs/ufs_bmap.c3
-rw-r--r--sys/vm/vm_fault.c30
-rw-r--r--sys/vm/vm_page.c51
-rw-r--r--sys/vm/vm_page.h8
-rw-r--r--sys/vm/vm_pager.c5
-rw-r--r--sys/vm/vm_pager.h18
-rw-r--r--sys/vm/vnode_pager.c5
30 files changed, 1414 insertions, 998 deletions
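
[Editor's note] One more illustrative sketch before the patch itself,
showing the b_bufsize/b_bcount distinction the log message refers to. The
function name is invented; uiomove() and the on/n idiom are assumed from
the nfs_bioread() hunks below:

	/*
	 * Hypothetical sketch.  b_bcount is the requested, byte-granular
	 * buffer size; b_bufsize is the DEV_BSIZE aligned allocation.
	 * A copy loop must bound at b_bcount, never b_bufsize, or it
	 * exposes the alignment padding past EOF.
	 */
	static int
	copy_from_buf_sketch(struct buf *bp, int on, struct uio *uio)
	{
		int n;

		n = uio->uio_resid;
		if (n > bp->b_bcount - on)	/* not bp->b_bufsize - on */
			n = bp->b_bcount - on;
		return (uiomove(bp->b_data + on, n, uio));
	}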
diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/gnu/ext2fs/ext2_bmap.c
+++ b/sys/gnu/ext2fs/ext2_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/gnu/fs/ext2fs/ext2_bmap.c
+++ b/sys/gnu/fs/ext2fs/ext2_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 803aab1..cb18320 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.206 1999/04/14 18:51:52 dt Exp $
+ * $Id: vfs_bio.c,v 1.207 1999/04/29 18:15:25 alc Exp $
*/
/*
@@ -74,9 +74,6 @@ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
vm_offset_t to);
-static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
- vm_offset_t off, vm_offset_t size,
- vm_page_t m);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
@@ -222,6 +219,27 @@ bufcountwakeup(void)
}
/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline__
+void
+vfs_buf_test_cache(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+
+/*
* Initialize buffer headers and related structures.
*/
void
@@ -371,7 +389,10 @@ bremfree(struct buf * bp)
/*
- * Get a buffer with the specified data. Look in the cache first.
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything ( see
+ * getblk() ).
*/
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
@@ -388,7 +409,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
curproc->p_stats->p_ru.ru_inblock++;
KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -403,7 +424,9 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
/*
* Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.
+ * read-ahead blocks. We must clear B_ERROR and B_INVAL prior
+ * to initiating I/O. If B_CACHE is set, the buffer is valid
+ * and we do not have to do anything.
*/
int
breadn(struct vnode * vp, daddr_t blkno, int size,
@@ -421,7 +444,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
if (bp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -441,7 +464,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
if (curproc != NULL)
curproc->p_stats->p_ru.ru_inblock++;
rabp->b_flags |= B_READ | B_ASYNC;
- rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ rabp->b_flags &= ~(B_ERROR | B_INVAL);
if (rabp->b_rcred == NOCRED) {
if (cred != NOCRED)
crhold(cred);
@@ -462,7 +485,14 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
/*
* Write, release buffer on completion. (Done by iodone
- * if async.)
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
*/
int
bwrite(struct buf * bp)
@@ -486,7 +516,7 @@ bwrite(struct buf * bp)
bundirty(bp);
bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
- bp->b_flags |= B_WRITEINPROG;
+ bp->b_flags |= B_WRITEINPROG | B_CACHE;
bp->b_vp->v_numoutput++;
vfs_busy_pages(bp, 1);
@@ -505,11 +535,12 @@ bwrite(struct buf * bp)
mp = vp->v_specmountpoint;
else
mp = vp->v_mount;
- if (mp != NULL)
+ if (mp != NULL) {
if ((oldflags & B_ASYNC) == 0)
mp->mnt_stat.f_syncwrites++;
else
mp->mnt_stat.f_asyncwrites++;
+ }
}
if ((oldflags & B_ASYNC) == 0) {
@@ -522,7 +553,13 @@ bwrite(struct buf * bp)
}
/*
- * Delayed write. (Buffer is marked dirty).
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather than in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
*/
void
bdwrite(struct buf * bp)
@@ -542,6 +579,12 @@ bdwrite(struct buf * bp)
bdirty(bp);
/*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
* This bmap keeps the system from needing to do the bmap later,
* perhaps when the system is attempting to do a sync. Since it
* is likely that the indirect block -- or whatever other datastructure
@@ -592,8 +635,11 @@ bdwrite(struct buf * bp)
* B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
* itself to properly update it in the dirty/clean lists. We mark it
* B_DONE to ensure that any asynchronization of the buffer properly
- * clears B_DONE ( else a panic will occur later ). Note that B_INVALID
- * buffers are not considered dirty even if B_DELWRI is set.
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
*
* Since the buffer is not on a queue, we do not update the numfreebuffers
* count.
@@ -645,6 +691,9 @@ bundirty(bp)
*
* Asynchronous write. Start output on a buffer, but do not wait for
* it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
*/
void
bawrite(struct buf * bp)
@@ -658,7 +707,8 @@ bawrite(struct buf * bp)
*
* Ordered write. Start output on a buffer, and flag it so that the
* device will write it in the order it was queued. The buffer is
- * released when the output completes.
+ * released when the output completes. bwrite() ( or the VOP routine
+ * anyway ) is responsible for handling B_INVAL buffers.
*/
int
bowrite(struct buf * bp)
@@ -694,10 +744,19 @@ brelse(struct buf * bp)
bp->b_flags &= ~B_ERROR;
if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) {
+ /*
+ * Failed write, redirty. Must clear B_ERROR to prevent
+ * pages from being scrapped. Note: B_INVAL is ignored
+ * here but will presumably be dealt with later.
+ */
bp->b_flags &= ~B_ERROR;
bdirty(bp);
} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
(bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed I/O or we were asked to free or not
+ * cache the buffer.
+ */
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
@@ -727,31 +786,22 @@ brelse(struct buf * bp)
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
- * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
- * but the VM object is kept around. The B_NOCACHE flag is used to
- * invalidate the pages in the VM object.
+ * constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
*
- * The b_{validoff,validend,dirtyoff,dirtyend} values are relative
- * to b_offset and currently have byte granularity, whereas the
- * valid flags in the vm_pages have only DEV_BSIZE resolution.
- * The byte resolution fields are used to avoid unnecessary re-reads
- * of the buffer but the code really needs to be genericized so
- * other filesystem modules can take advantage of these fields.
+ * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. B_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
*
- * XXX this seems to cause performance problems.
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer.
*/
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
bp->b_vp->v_type != VBLK &&
- (bp->b_flags & B_DELWRI) != 0)
-#ifdef notdef
- && (bp->b_vp->v_tag != VT_NFS
- || bp->b_vp->v_type == VBLK
- || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
- || bp->b_validend == 0
- || (bp->b_validoff == 0
- && bp->b_validend == bp->b_bufsize))
-#endif
+ (bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@@ -912,6 +962,11 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
* it.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
*/
void
bqrelse(struct buf * bp)
@@ -1096,6 +1151,8 @@ vfs_bio_awrite(struct buf * bp)
splx(s);
/*
* default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
*/
nwritten = bp->b_bufsize;
(void) VOP_BWRITE(bp);
@@ -1107,7 +1164,11 @@ vfs_bio_awrite(struct buf * bp)
* getnewbuf:
*
* Find and initialize a new buffer header, freeing up existing buffers
- * in the bufqueues as necessary.
+ * in the bufqueues as necessary. The new buffer is returned with
+ * flags set to B_BUSY.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
*
* We block if:
* We have insufficient buffer headers
@@ -1368,7 +1429,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_validoff = bp->b_validend = 0;
bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@@ -1465,7 +1525,10 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
-
+
+ /*
+ * The bp, if valid, is set to B_BUSY.
+ */
return (bp);
}
@@ -1546,9 +1609,10 @@ flushbufqueues(void)
}
/*
- * XXX NFS does weird things with B_INVAL bps if we bwrite
- * them ( vfs_bio_awrite/bawrite/bdwrite/etc ) Why?
- *
+ * Try to free up B_INVAL delayed-write buffers rather than
+ * writing them out. Note also that NFS is somewhat sensitive
+ * to B_INVAL buffers so it is doubly important that we do
+ * this.
*/
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@@ -1622,20 +1686,28 @@ inmem(struct vnode * vp, daddr_t blkno)
}
/*
- * now we set the dirty range for the buffer --
- * for NFS -- if the file is mapped and pages have
- * been written to, let it know. We want the
- * entire range of the buffer to be marked dirty if
- * any of the pages have been written to for consistancy
- * with the b_validoff, b_validend set in the nfs write
- * code, and used by the nfs read code.
+ * vfs_setdirty:
+ *
+ * Sets the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer.
+ *
+ * The range is limited to the size of the buffer.
+ *
+ * This routine is primarily used by NFS, but is generalized for the
+ * B_VMIO case.
*/
static void
vfs_setdirty(struct buf *bp)
{
int i;
vm_object_t object;
- vm_offset_t boffset;
+
+ /*
+ * Degenerate case - empty buffer
+ */
+
+ if (bp->b_bufsize == 0)
+ return;
/*
* We qualify the scan for modified pages on whether the
@@ -1654,6 +1726,9 @@ vfs_setdirty(struct buf *bp)
printf("Warning: object %p mightbedirty but not writeable\n", object);
if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
/*
* test the pages to see if they have been modified directly
* by users through the VM system.
@@ -1664,47 +1739,85 @@ vfs_setdirty(struct buf *bp)
}
/*
- * scan forwards for the first page modified
+ * Calculate the encompassing dirty range, boffset and eoffset,
+ * (eoffset - boffset) bytes.
*/
+
for (i = 0; i < bp->b_npages; i++) {
- if (bp->b_pages[i]->dirty) {
+ if (bp->b_pages[i]->dirty)
break;
- }
}
-
boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
- if (boffset < bp->b_dirtyoff) {
- bp->b_dirtyoff = max(boffset, 0);
- }
- /*
- * scan backwards for the last page modified
- */
for (i = bp->b_npages - 1; i >= 0; --i) {
if (bp->b_pages[i]->dirty) {
break;
}
}
- boffset = (i + 1);
-#if 0
- offset = boffset + bp->b_pages[0]->pindex;
- if (offset >= object->size)
- boffset = object->size - bp->b_pages[0]->pindex;
-#endif
- boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
- if (bp->b_dirtyend < boffset)
- bp->b_dirtyend = min(boffset, bp->b_bufsize);
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
}
}
/*
- * Get a block given a specified block and offset into a file/device.
+ * getblk:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ * The buffers B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and B_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
*/
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
struct buf *bp;
- int i, s;
+ int s;
struct bufhashhdr *bh;
#if !defined(MAX_PERF)
@@ -1727,6 +1840,10 @@ loop:
}
if ((bp = gbincore(vp, blkno))) {
+ /*
+ * Buffer is in-core
+ */
+
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
if (bp->b_usecount < BUF_MAXUSE)
@@ -1740,7 +1857,18 @@ loop:
splx(s);
return (struct buf *) NULL;
}
- bp->b_flags |= B_BUSY | B_CACHE;
+
+ /*
+ * Busy the buffer. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ bp->b_flags |= B_BUSY;
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO|B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
bremfree(bp);
/*
@@ -1770,7 +1898,9 @@ loop:
/*
* If the size is inconsistant in the VMIO case, we can resize
- * the buffer. This might lead to B_CACHE getting cleared.
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
*/
if (bp->b_bcount != size)
@@ -1780,45 +1910,19 @@ loop:
("getblk: no buffer offset"));
/*
- * Check that the constituted buffer really deserves for the
- * B_CACHE bit to be set. B_VMIO type buffers might not
- * contain fully valid pages. Normal (old-style) buffers
- * should be fully valid. This might also lead to B_CACHE
- * getting clear.
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
*
- * If B_CACHE is already clear, don't bother checking to see
- * if we have to clear it again.
- *
- * XXX this code should not be necessary unless the B_CACHE
- * handling is broken elsewhere in the kernel. We need to
- * check the cases and then turn the clearing part of this
- * code into a panic.
- */
- if (
- (bp->b_flags & (B_VMIO|B_CACHE)) == (B_VMIO|B_CACHE) &&
- (bp->b_vp->v_tag != VT_NFS || bp->b_validend <= 0)
- ) {
- int checksize = bp->b_bufsize;
- int poffset = bp->b_offset & PAGE_MASK;
- int resid;
- for (i = 0; i < bp->b_npages; i++) {
- resid = (checksize > (PAGE_SIZE - poffset)) ?
- (PAGE_SIZE - poffset) : checksize;
- if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
- bp->b_flags &= ~(B_CACHE | B_DONE);
- break;
- }
- checksize -= resid;
- poffset = 0;
- }
- }
-
- /*
- * If B_DELWRI is set and B_CACHE got cleared ( or was
- * already clear ), we have to commit the write and
- * retry. The NFS code absolutely depends on this,
- * and so might the FFS code. In anycase, it formalizes
- * the B_CACHE rules. See sys/buf.h.
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
@@ -1829,8 +1933,14 @@ loop:
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
splx(s);
- return (bp);
+ bp->b_flags &= ~B_DONE;
} else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is marked B_BUSY. Note that the
+ * returned buffer is also considered valid ( not marked
+ * B_INVAL ).
+ */
int bsize, maxsize, vmio;
off_t offset;
@@ -1849,7 +1959,7 @@ loop:
maxsize = imax(maxsize, bsize);
if ((bp = getnewbuf(vp, blkno,
- slpflag, slptimeo, size, maxsize)) == 0) {
+ slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@@ -1861,6 +1971,10 @@ loop:
* This code is used to make sure that a buffer is not
* created while the getnewbuf routine is blocked.
* This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created. There is now window
+ * race because we are safely running at splbio() from the
+ * point of the duplicate buffer creation through to here.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -1880,8 +1994,15 @@ loop:
bh = BUFHASH(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
+ /*
+ * set B_VMIO bit. allocbuf() the buffer bigger. Since the
+ * buffer size starts out as 0, B_CACHE will be set by
+ * allocbuf() for the VMIO case prior to it testing the
+ * backing store for validity.
+ */
+
if (vmio) {
- bp->b_flags |= (B_VMIO | B_CACHE);
+ bp->b_flags |= B_VMIO;
#if defined(VFS_BIO_DEBUG)
if (vp->v_type != VREG && vp->v_type != VBLK)
printf("getblk: vmioing file type %d???\n", vp->v_type);
@@ -1893,12 +2014,14 @@ loop:
allocbuf(bp, size);
splx(s);
- return (bp);
+ bp->b_flags &= ~B_DONE;
}
+ return (bp);
}
/*
- * Get an empty, disassociated buffer of given size.
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
*/
struct buf *
geteblk(int size)
@@ -1910,7 +2033,7 @@ geteblk(int size)
while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
- bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
return (bp);
}
@@ -1925,6 +2048,9 @@ geteblk(int size)
* deadlock or inconsistant data situations. Tread lightly!!!
* There are B_CACHE and B_DELWRI interactions that must be dealt with by
* the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
*/
int
@@ -1945,7 +2071,8 @@ allocbuf(struct buf *bp, int size)
caddr_t origbuf;
int origbufsize;
/*
- * Just get anonymous memory from the kernel
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
*/
mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
@@ -2046,13 +2173,25 @@ allocbuf(struct buf *bp, int size)
if (bp->b_flags & B_MALLOC)
panic("allocbuf: VMIO buffer can't be malloced");
#endif
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
if (newbsize < bp->b_bufsize) {
+ /*
+ * DEV_BSIZE aligned new buffer size is less than the
+ * DEV_BSIZE aligned existing buffer size. Figure out
+ * if we have to remove any pages.
+ */
if (desiredpages < bp->b_npages) {
for (i = desiredpages; i < bp->b_npages; i++) {
/*
* the page is not freed here -- it
- * is the responsibility of vnode_pager_setsize
+ * is the responsibility of
+ * vnode_pager_setsize
*/
m = bp->b_pages[i];
KASSERT(m != bogus_page,
@@ -2067,115 +2206,131 @@ allocbuf(struct buf *bp, int size)
(desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
bp->b_npages = desiredpages;
}
- } else if (newbsize > bp->b_bufsize) {
- vm_object_t obj;
- vm_offset_t tinc, toff;
- vm_ooffset_t off;
- vm_pindex_t objoff;
- int pageindex, curbpnpages;
+ } else if (size > bp->b_bcount) {
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
struct vnode *vp;
- int bsize;
- int orig_validoff = bp->b_validoff;
- int orig_validend = bp->b_validend;
-
- vp = bp->b_vp;
-
- if (vp->v_type == VBLK)
- bsize = DEV_BSIZE;
- else
- bsize = vp->v_mount->mnt_stat.f_iosize;
-
- if (bp->b_npages < desiredpages) {
- obj = vp->v_object;
- tinc = PAGE_SIZE;
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
- off = bp->b_offset;
- KASSERT(bp->b_offset != NOOFFSET,
- ("allocbuf: no buffer offset"));
- curbpnpages = bp->b_npages;
- doretry:
- bp->b_validoff = orig_validoff;
- bp->b_validend = orig_validend;
- bp->b_flags |= B_CACHE;
- for (toff = 0; toff < newbsize; toff += tinc) {
- objoff = OFF_TO_IDX(off + toff);
- pageindex = objoff - OFF_TO_IDX(off);
- tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
- if (pageindex < curbpnpages) {
-
- m = bp->b_pages[pageindex];
-#ifdef VFS_BIO_DIAG
- if (m->pindex != objoff)
- panic("allocbuf: page changed offset?!!!?");
-#endif
- if (tinc > (newbsize - toff))
- tinc = newbsize - toff;
- if (bp->b_flags & B_CACHE)
- vfs_buf_set_valid(bp, off, toff, tinc, m);
- continue;
- }
- m = vm_page_lookup(obj, objoff);
- if (!m) {
- m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
- if (!m) {
- VM_WAIT;
- vm_pageout_deficit += (desiredpages - curbpnpages);
- goto doretry;
- }
+ /*
+ * Step 1, bring in the VM pages from the object,
+ * allocating them if necessary. We must clear
+ * B_CACHE if these pages are not valid for the
+ * range covered by the buffer.
+ */
+ vp = bp->b_vp;
+ obj = vp->v_object;
+
+ while (bp->b_npages < desiredpages) {
+ vm_page_t m;
+ vm_pindex_t pi;
+
+ pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
+ if ((m = vm_page_lookup(obj, pi)) == NULL) {
+ m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
+ if (m == NULL) {
+ VM_WAIT;
+ vm_pageout_deficit += desiredpages - bp->b_npages;
+ } else {
vm_page_wire(m);
vm_page_wakeup(m);
bp->b_flags &= ~B_CACHE;
-
- } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) {
- /*
- * If we had to sleep, retry.
- *
- * Also note that we only test
- * PG_BUSY here, not m->busy.
- *
- * We cannot sleep on m->busy
- * here because a vm_fault ->
- * getpages -> cluster-read ->
- * ...-> allocbuf sequence
- * will convert PG_BUSY to
- * m->busy so we have to let
- * m->busy through if we do
- * not want to deadlock.
- */
- goto doretry;
- } else {
- if ((curproc != pageproc) &&
- ((m->queue - m->pc) == PQ_CACHE) &&
- ((cnt.v_free_count + cnt.v_cache_count) <
- (cnt.v_free_min + cnt.v_cache_min))) {
- pagedaemon_wakeup();
- }
- if (tinc > (newbsize - toff))
- tinc = newbsize - toff;
- if (bp->b_flags & B_CACHE)
- vfs_buf_set_valid(bp, off, toff, tinc, m);
- vm_page_flag_clear(m, PG_ZERO);
- vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
}
- bp->b_pages[pageindex] = m;
- curbpnpages = pageindex + 1;
+ continue;
}
- if (vp->v_tag == VT_NFS &&
- vp->v_type != VBLK) {
- if (bp->b_dirtyend > 0) {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
- }
- if (bp->b_validend == 0)
- bp->b_flags &= ~B_CACHE;
+
+ /*
+ * We found a page. If we have to sleep on it,
+ * retry because it might have gotten freed out
+ * from under us.
+ *
+ * We can only test PG_BUSY here. Blocking on
+ * m->busy might lead to a deadlock:
+ *
+ * vm_fault->getpages->cluster_read->allocbuf
+ *
+ */
+
+ if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
+ continue;
+
+ /*
+ * We have a good page. Should we wakeup the
+ * page daemon?
+ */
+ if ((curproc != pageproc) &&
+ ((m->queue - m->pc) == PQ_CACHE) &&
+ ((cnt.v_free_count + cnt.v_cache_count) <
+ (cnt.v_free_min + cnt.v_cache_min))
+ ) {
+ pagedaemon_wakeup();
}
- bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
- bp->b_npages = curbpnpages;
- pmap_qenter((vm_offset_t) bp->b_data,
- bp->b_pages, bp->b_npages);
- ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_wire(m);
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
}
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+ * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >>
+ PAGE_SHIFT;
+
+ vfs_buf_test_cache(
+ bp,
+ bp->b_offset,
+ toff,
+ tinc,
+ bp->b_pages[pi]
+ );
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+
+ /*
+ * Step 3, fixup the KVM pmap. Remember that
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+
+ bp->b_data = (caddr_t)
+ trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter(
+ (vm_offset_t)bp->b_data,
+ bp->b_pages,
+ bp->b_npages
+ );
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
if (bp->b_flags & B_VMIO)
@@ -2184,13 +2339,17 @@ allocbuf(struct buf *bp, int size)
runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
- bp->b_bufsize = newbsize;
- bp->b_bcount = size;
+ bp->b_bufsize = newbsize; /* actual buffer allocation */
+ bp->b_bcount = size; /* requested buffer size */
return 1;
}
/*
- * Wait for buffer I/O completion, returning error status.
+ * biowait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left B_BUSY|B_DONE on return. B_EINTR is converted into a EINTR
+ * error and cleared.
*/
int
biowait(register struct buf * bp)
@@ -2220,9 +2379,23 @@ biowait(register struct buf * bp)
}
/*
- * Finish I/O on a buffer, calling an optional function.
- * This is usually called from interrupt level, so process blocking
- * is not *a good idea*.
+ * biodone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ * read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * biodone does not mess with B_INVAL, allowing the I/O routine or the
+ * initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
*/
void
biodone(register struct buf * bp)
@@ -2295,7 +2468,17 @@ biodone(register struct buf * bp)
obj->paging_in_progress, bp->b_npages);
}
#endif
- iosize = bp->b_bufsize;
+
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+ * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ iosize = bp->b_bcount;
+ if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
+ bp->b_flags |= B_CACHE;
+ }
+
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
m = bp->b_pages[i];
@@ -2307,6 +2490,7 @@ biodone(register struct buf * bp)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
+ bp->b_flags &= ~B_CACHE;
continue;
}
bp->b_pages[i] = m;
@@ -2325,8 +2509,8 @@ biodone(register struct buf * bp)
/*
* In the write case, the valid and clean bits are
- * already changed correctly, so we only need to do this
- * here in the read case.
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
*/
if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
vfs_page_set_valid(bp, foff, i, m);
@@ -2453,106 +2637,45 @@ vfs_unbusy_pages(struct buf * bp)
}
/*
- * Set NFS' b_validoff and b_validend fields from the valid bits
- * of a page. If the consumer is not NFS, and the page is not
- * valid for the entire range, clear the B_CACHE flag to force
- * the consumer to re-read the page.
+ * vfs_page_set_valid:
*
- * B_CACHE interaction is especially tricky.
- */
-static void
-vfs_buf_set_valid(struct buf *bp,
- vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
- vm_page_t m)
-{
- if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
- vm_offset_t svalid, evalid;
- int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
-
- /*
- * This only bothers with the first valid range in the
- * page.
- */
- svalid = off;
- while (validbits && !(validbits & 1)) {
- svalid += DEV_BSIZE;
- validbits >>= 1;
- }
- evalid = svalid;
- while (validbits & 1) {
- evalid += DEV_BSIZE;
- validbits >>= 1;
- }
- evalid = min(evalid, off + size);
- /*
- * We can only set b_validoff/end if this range is contiguous
- * with the range built up already. If we cannot set
- * b_validoff/end, we must clear B_CACHE to force an update
- * to clean the bp up.
- */
- if (svalid == bp->b_validend) {
- bp->b_validoff = min(bp->b_validoff, svalid);
- bp->b_validend = max(bp->b_validend, evalid);
- } else {
- bp->b_flags &= ~B_CACHE;
- }
- } else if (!vm_page_is_valid(m,
- (vm_offset_t) ((foff + off) & PAGE_MASK),
- size)) {
- bp->b_flags &= ~B_CACHE;
- }
-}
-
-/*
- * Set the valid bits in a page, taking care of the b_validoff,
- * b_validend fields which NFS uses to optimise small reads. Off is
- * the offset within the file and pageno is the page index within the buf.
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
*
- * XXX we have to set the valid & clean bits for all page fragments
- * touched by b_validoff/validend, even if the page fragment goes somewhat
- * beyond b_validoff/validend due to alignment.
+ * For NFS, the range is additionally restricted to b_validoff/end.
+ * validoff/end must be DEV_BSIZE chunky or the end must be at the
+ * file EOF. If a dirty range exists, set the page's dirty bits
+ * inclusively.
+ *
+ * This routine is typically called after a read completes.
*/
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
- struct vnode *vp = bp->b_vp;
vm_ooffset_t soff, eoff;
/*
* Start and end offsets in buffer. eoff - soff may not cross a
- * page boundry or cross the end of the buffer.
+ * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
*/
soff = off;
eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
- if (eoff > bp->b_offset + bp->b_bufsize)
- eoff = bp->b_offset + bp->b_bufsize;
-
- if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
- vm_ooffset_t sv, ev;
- vm_page_set_invalid(m,
- (vm_offset_t) (soff & PAGE_MASK),
- (vm_offset_t) (eoff - soff));
- /*
- * bp->b_validoff and bp->b_validend restrict the valid range
- * that we can set. Note that these offsets are not DEV_BSIZE
- * aligned. vm_page_set_validclean() must know what
- * sub-DEV_BSIZE ranges to clear.
- */
-#if 0
- sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
- ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) &
- ~(DEV_BSIZE - 1);
-#endif
- sv = bp->b_offset + bp->b_validoff;
- ev = bp->b_offset + bp->b_validend;
- soff = qmax(sv, soff);
- eoff = qmin(ev, eoff);
- }
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
- if (eoff > soff)
- vm_page_set_validclean(m,
- (vm_offset_t) (soff & PAGE_MASK),
- (vm_offset_t) (eoff - soff));
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
}
/*
@@ -2562,6 +2685,10 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* almost as being PG_BUSY. Also the object paging_in_progress
* flag is handled to make sure that the object doesn't become
* inconsistant.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as B_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
*/
void
vfs_busy_pages(struct buf * bp, int clear_modify)
@@ -2595,6 +2722,22 @@ retry:
vm_page_io_start(m);
}
+ /*
+ * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+
vm_page_protect(m, VM_PROT_NONE);
if (clear_modify)
vfs_page_set_valid(bp, foff, i, m);
@@ -2614,30 +2757,89 @@ retry:
* Tell the VM system that the pages associated with this buffer
* are clean. This is used for delayed writes where the data is
* going to go to disk eventually without additional VM intevention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
*/
-void
+static void
vfs_clean_pages(struct buf * bp)
{
int i;
if (bp->b_flags & B_VMIO) {
vm_ooffset_t foff;
+
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
+ vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ vm_ooffset_t eoff = noff;
+
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
vfs_page_set_valid(bp, foff, i, m);
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
}
}
}
+/*
+ * vfs_bio_set_validclean:
+ *
+ * Set the range within the buffer to valid and clean. The range is
+ * relative to the beginning of the buffer, b_offset. Note that b_offset
+ * itself may be offset from the beginning of the first page.
+ */
+
+void
+vfs_bio_set_validclean(struct buf *bp, int base, int size)
+{
+ if (bp->b_flags & B_VMIO) {
+ int i;
+ int n;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (n > size)
+ n = size;
+
+ vm_page_set_validclean(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * clear a buffer. This routine essentially fakes an I/O, so we need
+ * to clear B_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+
void
vfs_bio_clrbuf(struct buf *bp) {
int i, mask = 0;
caddr_t sa, ea;
if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
(bp->b_offset & PAGE_MASK) == 0) {
mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index f7bd95e..5f7f870 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $
+ * $Id: vfs_cluster.c,v 1.80 1999/03/12 02:24:56 julian Exp $
*/
#include "opt_debug_cluster.h"
@@ -251,6 +251,7 @@ single_block_read:
#endif
if ((bp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(bp, 0);
+ bp->b_flags &= ~(B_ERROR|B_INVAL);
error = VOP_STRATEGY(vp, bp);
curproc->p_stats->p_ru.ru_inblock++;
}
@@ -283,6 +284,7 @@ single_block_read:
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
+ rbp->b_flags &= ~(B_ERROR|B_INVAL);
(void) VOP_STRATEGY(vp, rbp);
curproc->p_stats->p_ru.ru_inblock++;
}
@@ -473,8 +475,10 @@ cluster_callback(bp)
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
- } else
- tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~(B_ERROR|B_INVAL);
+ }
biodone(tbp);
}
relpbuf(bp, &cluster_pbuf_freecnt);
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index c0565a4..de5d18d 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -138,6 +138,18 @@ vop_panic(struct vop_generic_args *ap)
panic("illegal vnode op called");
}
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * B_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a B_READ strategy call. Typically
+ * B_INVAL is assumed to already be clear prior to a write and should not
+ * be cleared manually unless you just made the buffer invalid. B_ERROR
+ * should be cleared either way.
+ */
+
static int
vop_nostrategy (struct vop_strategy_args *ap)
{
diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h
index bc15a7c..78a54a2 100644
--- a/sys/nfs/nfs.h
+++ b/sys/nfs/nfs.h
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs.h 8.4 (Berkeley) 5/1/95
- * $Id: nfs.h,v 1.44 1998/09/07 05:42:15 bde Exp $
+ * $Id: nfs.h,v 1.45 1999/02/25 00:03:50 peter Exp $
*/
#ifndef _NFS_NFS_H_
@@ -651,8 +651,7 @@ void nfs_disconnect __P((struct nfsmount *));
void nfs_safedisconnect __P((struct nfsmount *));
int nfs_getattrcache __P((struct vnode *, struct vattr *));
int nfsm_strtmbuf __P((struct mbuf **, char **, const char *, long));
-int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *,
- int));
+int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *));
int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *));
void nfsrv_init __P((int));
void nfs_clearcommit __P((struct mount *));
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index cef982b..0d8a782 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.68 1999/04/05 19:38:28 julian Exp $
+ * $Id: nfs_bio.c,v 1.69 1999/04/06 03:07:54 peter Exp $
*/
@@ -65,7 +65,6 @@
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
-static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
@@ -84,7 +83,7 @@ nfs_getpages(ap)
vm_ooffset_t a_offset;
} */ *ap;
{
- int i, error, nextoff, size, toff, npages, count;
+ int i, error, nextoff, size, toff, count, npages;
struct uio uio;
struct iovec iov;
vm_offset_t kva;
@@ -110,13 +109,35 @@ nfs_getpages(ap)
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
(void)nfs_fsinfo(nmp, vp, cred, p);
+
+ npages = btoc(count);
+
+ /*
+ * If the requested page is partially valid, just return it and
+ * allow the pager to zero-out the blanks. Partially valid pages
+ * can only occur at the file EOF.
+ */
+
+ {
+ vm_page_t m = pages[ap->a_reqpage];
+
+ if (m->valid != 0) {
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
+ return(0);
+ }
+ }
+
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf(&nfs_pbuf_freecnt);
- npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@@ -167,12 +188,12 @@ nfs_getpages(ap)
m->dirty = 0;
} else if (size > toff) {
/*
- * Read operation filled a partial page, set valid
- * bits properly. validclean will zero out
- * any cruft in the buffer when setting a valid bit,
- * if the size is not DEV_BSIZE aligned.
+ * Read operation filled a partial page.
*/
+ m->valid = 0;
vm_page_set_validclean(m, 0, size - toff);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(m, TRUE); */
}
if (i != ap->a_reqpage) {
@@ -197,13 +218,6 @@ nfs_getpages(ap)
} else {
vnode_pager_freepage(m);
}
- } else {
- /*
- * This page is being mapped, clear out any other
- * cruft in the invalid areas of the page.
- */
- if (m->valid && m->valid != VM_PAGE_BITS_ALL)
- vm_page_zero_invalid(m, FALSE);
}
}
return 0;
@@ -228,14 +242,17 @@ nfs_putpages(ap)
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages, count;
+ off_t offset;
int *rtvals;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct nfsmount *nmp;
+ struct nfsnode *np;
vm_page_t *pages;
vp = ap->a_vp;
+ np = VTONFS(vp);
p = curproc; /* XXX */
cred = curproc->p_ucred; /* XXX */
nmp = VFSTONFS(vp->v_mount);
@@ -243,6 +260,7 @@ nfs_putpages(ap)
count = ap->a_count;
rtvals = ap->a_rtvals;
npages = btoc(count);
+ offset = IDX_TO_OFF(pages[0]->pindex);
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
@@ -253,6 +271,16 @@ nfs_putpages(ap)
}
/*
+ * When putting pages, do not extend file past EOF.
+ */
+
+ if (offset + count > np->n_size) {
+ count = np->n_size - offset;
+ if (count < 0)
+ count = 0;
+ }
+
+ /*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
@@ -265,7 +293,7 @@ nfs_putpages(ap)
iov.iov_len = count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
+ uio.uio_offset = offset;
uio.uio_resid = count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
@@ -297,23 +325,21 @@ nfs_putpages(ap)
* Vnode op for read using bio
*/
int
-nfs_bioread(vp, uio, ioflag, cred, getpages)
+nfs_bioread(vp, uio, ioflag, cred)
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
- int getpages;
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, i;
- off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn, rabn;
- int bufsize;
- int nra, error = 0, n = 0, on = 0, not_readin;
+ int bcount;
+ int nra, error = 0, n = 0, on = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@@ -424,7 +450,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- not_readin = 1;
/*
* Start the read ahead(s), as required.
@@ -439,7 +464,6 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -453,47 +477,31 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
}
/*
- * If the block is in the cache and has the required data
- * in a valid region, just copy it out.
- * Otherwise, get the block and write back/read in,
- * as required.
+ * Obtain the buffer cache block. Figure out the buffer size
+ * when we are at EOF. nfs_getcacheblk() will also force
+ * uncached delayed-writes to be flushed to the server.
+ *
+ * Note that bcount is *not* DEV_BSIZE aligned.
*/
-again:
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size &&
- (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ bcount = biosize;
+ if ((off_t)lbn * biosize >= np->n_size) {
+ bcount = 0;
+ } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
+ bcount = np->n_size - (off_t)lbn * biosize;
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
+
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
if (!bp)
return (EINTR);
/*
- * If we are being called from nfs_getpages, we must
- * make sure the buffer is a vmio buffer. The vp will
- * already be setup for vmio but there may be some old
- * non-vmio buffers attached to it.
+ * If B_CACHE is not set, we must issue the read. If this
+ * fails, we return an error.
*/
- if (getpages && !(bp->b_flags & B_VMIO)) {
-#ifdef DIAGNOSTIC
- printf("nfs_bioread: non vmio buf found, discarding\n");
-#endif
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
+
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
- not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -501,32 +509,20 @@ again:
return (error);
}
}
- if (bufsize > on) {
- n = min((unsigned)(bufsize - on), uio->uio_resid);
- } else {
- n = 0;
- }
- diff = np->n_size - uio->uio_offset;
- if (diff < n)
- n = diff;
- if (not_readin && n > 0) {
- if (on < bp->b_validoff || (on + n) > bp->b_validend) {
- bp->b_flags |= B_NOCACHE;
- bp->b_flags |= B_INVAFTERWRITE;
- if (bp->b_dirtyend > 0) {
- if ((bp->b_flags & B_DELWRI) == 0)
- panic("nfsbioread");
- if (VOP_BWRITE(bp) == EINTR)
- return (EINTR);
- } else
- brelse(bp);
- goto again;
- }
- }
+
+ /*
+ * on is the offset into the current bp. Figure out how many
+ * bytes we can copy out of the bp. Note that bcount is
+ * NOT DEV_BSIZE aligned.
+ *
+ * Then figure out how many bytes we can copy into the uio.
+ */
+
+ n = 0;
+ if (on < bcount)
+ n = min((unsigned)(bcount - on), uio->uio_resid);
+
vp->v_lastr = lbn;
- diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
- if (diff < n)
- n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
@@ -535,7 +531,6 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
@@ -560,13 +555,13 @@ again:
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
+ printf("got bad cookie vp %p bp %p\n", vp, bp);
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
@@ -574,6 +569,10 @@ again:
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
+ *
+ * Leave the last bp intact unless there is an error.
+ * Loop back up to the while if the error is another
+ * NFSERR_BAD_COOKIE (double yuch!).
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
@@ -582,21 +581,32 @@ again:
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
- if ((bp->b_flags & B_DONE) == 0) {
- bp->b_flags |= B_READ;
- bp->b_flags &= ~B_DONE;
- vfs_busy_pages(bp, 0);
- error = nfs_doio(bp, cred, p);
- if (error == 0 && (bp->b_flags & B_INVAL))
- break;
- if (error) {
- brelse(bp);
- } else if (i < lbn) {
- brelse(bp);
- }
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ /*
+ * no error + B_INVAL == directory EOF,
+ * use the block.
+ */
+ if (error == 0 && (bp->b_flags & B_INVAL))
+ break;
}
+ /*
+ * An error will throw away the block and the
+ * for loop will break out. If no error and this
+ * is not the block we want, we throw away the
+ * block and go for the next one via the for loop.
+ */
+ if (error || i < lbn)
+ brelse(bp);
}
}
+ /*
+ * The above while is repeated if we hit another cookie
+ * error. If we hit an error and it wasn't a cookie error,
+ * we give up.
+ */
if (error)
return (error);
}
@@ -616,7 +626,6 @@ again:
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
rabp->b_flags |= (B_READ | B_ASYNC);
- rabp->b_flags &= ~B_DONE;
vfs_busy_pages(rabp, 0);
if (nfs_asyncio(rabp, cred)) {
rabp->b_flags |= B_INVAL|B_ERROR;
@@ -629,10 +638,20 @@ again:
}
}
/*
- * Make sure we use a signed variant of min() since
- * the second term may be negative.
+ * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
+ * chopped for the EOF condition, we cannot tell how large
+ * NFS directories are going to be until we hit EOF. So
+ * an NFS directory buffer is *not* chopped to its EOF. Now,
+ * it just so happens that b_resid will effectively chop it
+ * to EOF. *BUT* this information is lost if the buffer goes
+ * away and is reconstituted into a B_CACHE state ( due to
+ * being VMIO ) later. So we keep track of the directory eof
+ * in np->n_direofoffset and chop it off as an extra step
+ * right here.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
+ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
+ n = np->n_direofoffset - uio->uio_offset;
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
@@ -649,6 +668,10 @@ again:
n = 0;
break;
case VDIR:
+ /*
+ * Invalidate buffer if caching is disabled, forcing a
+ * re-read from the remote later.
+ */
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
break;
@@ -660,24 +683,6 @@ again:
return (error);
}
-static void
-nfs_prot_buf(bp, off, n)
- struct buf *bp;
- int off;
- int n;
-{
- int pindex, boff, end;
-
- if ((bp->b_flags & B_VMIO) == 0)
- return;
-
- end = round_page(off + n);
- for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
- pindex = boff >> PAGE_SHIFT;
- vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
- }
-}
-
/*
* Vnode op for write using bio
*/
@@ -690,18 +695,18 @@ nfs_write(ap)
struct ucred *a_cred;
} */ *ap;
{
- register int biosize;
- register struct uio *uio = ap->a_uio;
+ int biosize;
+ struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
- register struct ucred *cred = ap->a_cred;
+ struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
- int bufsize;
+ int bcount;
int n, on, error = 0, iomode, must_commit;
#ifdef DIAGNOSTIC
@@ -749,12 +754,9 @@ nfs_write(ap)
psignal(p, SIGXFSZ);
return (EFBIG);
}
- /*
- * I use nm_rsize, not nm_wsize so that all buffer cache blocks
- * will be the same size within a filesystem. nfs_writerpc will
- * still use nm_wsize when sizing the rpc's.
- */
+
biosize = vp->v_mount->mnt_stat.f_iosize;
+
do {
/*
* Check for a valid write lease.
@@ -786,17 +788,74 @@ nfs_write(ap)
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
- if (uio->uio_offset + n > np->n_size) {
+ /*
+ * Handle direct append and file extension cases, calculate
+ * unaligned buffer size.
+ */
+
+ if (uio->uio_offset == np->n_size && n) {
+ /*
+ * special append case. Obtain buffer prior to
+ * resizing it to maintain B_CACHE.
+ */
+ long save;
+
+ bcount = on;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ save = bp->b_flags & B_CACHE;
+
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
+
+ bcount += n;
+ allocbuf(bp, bcount);
+ bp->b_flags |= save;
+ } else {
+ if (uio->uio_offset + n > np->n_size) {
+ np->n_size = uio->uio_offset + n;
+ np->n_flag |= NMODIFIED;
+ vnode_pager_setsize(vp, np->n_size);
+ }
+ bcount = biosize;
+ if ((off_t)(lbn + 1) * biosize > np->n_size)
+ bcount = np->n_size - (off_t)lbn * biosize;
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
+ }
+
+ /*
+ * Issue a READ if B_CACHE is not set. In special-append
+ * mode, B_CACHE is based on the buffer prior to the write
+ * op and is typically set, avoiding the read. If a read
+ * is required in special append mode, the server will
+ * probably send us a short-read since we extended the file
+	 * on our end, resulting in b_resid == 0 and, thus,
+ * B_CACHE getting set.
+ *
+ * We can also avoid issuing the read if the write covers
+ * the entire buffer. We have to make sure the buffer state
+ * is reasonable in this case since we will not be initiating
+ * I/O. See the comments in kern/vfs_bio.c's getblk() for
+ * more information.
+ *
+ * B_CACHE may also be set due to the buffer being cached
+ * normally.
+ */
+
+ if (on == 0 && n == bcount) {
+ bp->b_flags |= B_CACHE;
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
}
- bufsize = biosize;
- if ((off_t)(lbn + 1) * biosize > np->n_size) {
- bufsize = np->n_size - (off_t)lbn * biosize;
- bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+
+ if ((bp->b_flags & B_CACHE) == 0) {
+ bp->b_flags |= B_READ;
+ vfs_busy_pages(bp, 0);
+ error = nfs_doio(bp, cred, p);
+ if (error) {
+ brelse(bp);
+ return (error);
+ }
}
- bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
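
The append-case flag dance above — take the buffer at its pre-append size, remember B_CACHE, grow it, restore the bit — in a reduced sketch (toy struct and a toy allocbuf, not <sys/buf.h>; the real allocbuf() may clear B_CACHE when it grows a buffer, which is the whole point of the save/restore):

    #include <stdio.h>

    #define B_CACHE 0x20            /* illustrative bit value */

    struct toybuf { long b_flags; int b_bcount; };

    static void
    toy_allocbuf(struct toybuf *bp, int size)
    {
        /* growing a buffer may invalidate B_CACHE */
        if (size > bp->b_bcount)
            bp->b_flags &= ~B_CACHE;
        bp->b_bcount = size;
    }

    int
    main(void)
    {
        struct toybuf buf = { B_CACHE, 0 };
        int on = 100, n = 60;
        long save;

        buf.b_bcount = on;          /* buffer obtained at the old size */
        save = buf.b_flags & B_CACHE;
        toy_allocbuf(&buf, on + n); /* extend to cover the append */
        buf.b_flags |= save;        /* restore B_CACHE: no read rpc needed */
        printf("flags=%#lx bcount=%d\n", buf.b_flags, buf.b_bcount);
        return (0);
    }
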
@@ -820,6 +879,17 @@ again:
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
+ *
+ * While it is possible to merge discontiguous writes due to
+ * our having a B_CACHE buffer ( and thus valid read data
+ * for the hole), we don't because it could lead to
+ * significant cache coherency problems with multiple clients,
+ * especially if locking is implemented later on.
+ *
+	 * As an optimization we could theoretically maintain
+	 * a linked list of discontiguous areas, but we would still
+ * have to commit them separately so there isn't much
+ * advantage to it except perhaps a bit of asynchronization.
*/
if (bp->b_dirtyend > 0 &&
@@ -863,11 +933,6 @@ again:
}
/*
- * This will keep the buffer and mmaped regions more coherent.
- */
- nfs_prot_buf(bp, on, n);
-
- /*
* Only update dirtyoff/dirtyend if not a degenerate
* condition.
*/
@@ -879,21 +944,7 @@ again:
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
- }
-
- /*
- * To avoid code complexity, we may have to throw away
- * previously valid ranges when merging the new dirty range
- * into the valid range. As long as we do not *ADD* an
- * invalid valid range, we are ok.
- */
- if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
- bp->b_validoff > bp->b_dirtyend) {
- bp->b_validoff = bp->b_dirtyoff;
- bp->b_validend = bp->b_dirtyend;
- } else {
- bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
- bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
+ vfs_bio_set_validclean(bp, on, n);
}
/*
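
The dirty-range bookkeeping in a reduced sketch (toy struct; the discontiguous case, which forces a write rpc of the old area first, is omitted, and the min/max merge is assumed from the clipped context above rather than copied from it):

    #include <stdio.h>

    struct toybuf { int b_dirtyoff, b_dirtyend; };

    static void
    record_write(struct toybuf *bp, int on, int n)
    {
        if (n == 0)
            return;                     /* degenerate: nothing written */
        if (bp->b_dirtyend > 0) {
            /* grow the existing contiguous dirty region */
            if (on < bp->b_dirtyoff)
                bp->b_dirtyoff = on;
            if (on + n > bp->b_dirtyend)
                bp->b_dirtyend = on + n;
        } else {
            bp->b_dirtyoff = on;        /* first dirty data in this bp */
            bp->b_dirtyend = on + n;
        }
    }

    int
    main(void)
    {
        struct toybuf buf = { 0, 0 };

        record_write(&buf, 512, 256);   /* [512,768) */
        record_write(&buf, 640, 512);   /* overlaps: grows to [512,1152) */
        printf("dirty [%d,%d)\n", buf.b_dirtyoff, buf.b_dirtyend);
        return (0);
    }
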
@@ -904,11 +955,14 @@ again:
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
+ *
+		 * IO_INVAL appears to be unused; the idea seems to be
+		 * to turn off caching in this case. Very odd. XXX
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
if (ioflag & IO_INVAL)
- bp->b_flags |= B_INVAL;
+ bp->b_flags |= B_NOCACHE;
error = VOP_BWRITE(bp);
if (error)
return (error);
@@ -922,8 +976,9 @@ again:
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
- } else
+ } else {
bdwrite(bp);
+ }
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
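
The three-way write dispatch above, reduced to a toy routine (flag values are illustrative; the exact trigger for the async branch is clipped from the context above, so "the write dirtied the whole block" is an assumption here):

    #include <stdio.h>

    #define NONCACHE    0x1     /* stand-in for NQNFSNONCACHE */
    #define IO_SYNC     0x2
    #define WHOLE_BLOCK 0x4     /* assumed: write covered the block */

    static const char *
    dispatch(int flags)
    {
        if (flags & (NONCACHE | IO_SYNC))
            return ("VOP_BWRITE: synchronous write rpc now");
        if (flags & WHOLE_BLOCK)
            return ("nfs_writebp + B_ASYNC: push it, don't wait");
        return ("bdwrite: delay, more piecemeal writes may merge");
    }

    int
    main(void)
    {
        printf("%s\n", dispatch(IO_SYNC));
        printf("%s\n", dispatch(WHOLE_BLOCK));
        printf("%s\n", dispatch(0));
        return (0);
    }
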
@@ -956,15 +1011,16 @@ nfs_getcacheblk(vp, bn, size, p)
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
- } else
+ } else {
bp = getblk(vp, bn, size, 0, 0);
+ }
if (vp->v_type == VREG) {
int biosize;
+
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = bn * (biosize / DEV_BSIZE);
}
-
return (bp);
}
@@ -1036,6 +1092,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
+ *
+ * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
+ * is eventually dequeued by the async daemon, nfs_doio() *will*.
*/
int
nfs_asyncio(bp, cred)
@@ -1164,7 +1223,7 @@ nfs_doio(bp, cr, p)
struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
- int error = 0, diff, len, iomode, must_commit = 0;
+ int error = 0, iomode, must_commit = 0;
struct uio uio;
struct iovec io;
@@ -1177,6 +1236,13 @@ nfs_doio(bp, cr, p)
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
+ /*
+ * clear B_ERROR and B_INVAL state prior to initiating the I/O. We
+ * do this here so we do not have to do it in all the code that
+ * calls us.
+ */
+ bp->b_flags &= ~(B_ERROR | B_INVAL);
+
KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
/*
@@ -1216,25 +1282,22 @@ nfs_doio(bp, cr, p)
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
- bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
- * If len > 0, there is a hole in the file and
- * no writes after the hole have been pushed to
- * the server yet.
- * Just zero fill the rest of the valid area.
+ * If we had a short read with no error, we must have
+ * hit a file hole. We should zero-fill the remainder.
+ * This can also occur if the server hits the file EOF.
+ *
+ * Holes used to be able to occur due to pending
+ * writes, but that is not possible any longer.
*/
- diff = bp->b_bcount - uiop->uio_resid;
- len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
- + diff);
- if (len > 0) {
- len = min(len, uiop->uio_resid);
- bzero((char *)bp->b_data + diff, len);
- bp->b_validend = diff + len;
- } else
- bp->b_validend = diff;
- } else
- bp->b_validend = bp->b_bcount;
+ int nread = bp->b_bcount - uiop->uio_resid;
+ int left = bp->b_bcount - nread;
+
+ if (left > 0)
+ bzero((char *)bp->b_data + nread, left);
+ uiop->uio_resid = 0;
+ }
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
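
The short-read fixup in isolation, with sample numbers (memset stands in for the kernel bzero):

    /*
     * Everything the read rpc did not return between nread and
     * b_bcount is zeroed, so biodone() can safely mark the buffer
     * fully valid (B_CACHE).
     */
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char data[8192];
        int b_bcount = 8192;
        int uio_resid = 3000;   /* rpc came up 3000 bytes short */
        int nread = b_bcount - uio_resid;
        int left = b_bcount - nread;

        memset(data, 0xff, sizeof(data));   /* pretend stale contents */
        if (left > 0)
            memset(data + nread, 0, left);  /* bzero() in the kernel */
        printf("zeroed [%d,%d)\n", nread, b_bcount);
        return (0);
    }
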
@@ -1262,6 +1325,10 @@ nfs_doio(bp, cr, p)
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
+ /*
+ * end-of-directory sets B_INVAL but does not generate an
+ * error.
+ */
if (error == 0 && uiop->uio_resid == bp->b_bcount)
bp->b_flags |= B_INVAL;
break;
@@ -1296,7 +1363,7 @@ nfs_doio(bp, cr, p)
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
- && bp->b_dirtyend == bp->b_bufsize)
+ && bp->b_dirtyend == bp->b_bcount)
bp->b_flags |= B_CLUSTEROK;
} else {
bp->b_flags &= ~B_NEEDCOMMIT;
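
Why the test above had to move from b_bufsize to b_bcount: near EOF the two differ, so the old comparison could never be true for an EOF-clipped buffer. A stand-alone check (DEV_BSIZE assumed to be 512):

    #include <stdio.h>

    #define DEV_BSIZE 512

    int
    main(void)
    {
        int b_bcount = 6500;    /* EOF-clipped request */
        int b_bufsize = (b_bcount + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
        int b_dirtyend = 6500;  /* entire buffer dirty */

        printf("bufsize=%d bcount=%d\n", b_bufsize, b_bcount); /* 6656 6500 */
        printf("old test: %s\n",
            b_dirtyend == b_bufsize ? "B_CLUSTEROK" : "never set");
        printf("new test: %s\n",
            b_dirtyend == b_bcount ? "B_CLUSTEROK" : "never set");
        return (0);
    }
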
diff --git a/sys/nfs/nfs_nqlease.c b/sys/nfs/nfs_nqlease.c
index 71f692a..e45c73f 100644
--- a/sys/nfs/nfs_nqlease.c
+++ b/sys/nfs/nfs_nqlease.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_nqlease.c 8.9 (Berkeley) 5/20/95
- * $Id: nfs_nqlease.c,v 1.39 1998/10/31 15:31:25 peter Exp $
+ * $Id: nfs_nqlease.c,v 1.40 1999/02/25 00:03:51 peter Exp $
*/
@@ -561,6 +561,10 @@ nqsrv_send_eviction(vp, lp, slp, nam, cred)
*mtod(m, u_int32_t *) = htonl(0x80000000 |
(m->m_pkthdr.len - NFSX_UNSIGNED));
}
+ /*
+ * nfs_sndlock if PR_CONNREQUIRED XXX
+ */
+
if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 &&
(lph->lph_slp->ns_flag & SLP_VALID) == 0) ||
(nfs_slplock(lph->lph_slp, 0) == 0))
diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c
index 1490f72..2267629 100644
--- a/sys/nfs/nfs_socket.c
+++ b/sys/nfs/nfs_socket.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
- * $Id: nfs_socket.c,v 1.50 1999/02/25 00:03:51 peter Exp $
+ * $Id: nfs_socket.c,v 1.51 1999/04/24 11:29:48 dt Exp $
*/
/*
@@ -54,6 +54,7 @@
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
+#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
@@ -115,6 +116,15 @@ static int proct[NFS_NPROCS] = {
0, 0, 0,
};
+static int nfs_realign_test;
+static int nfs_realign_count;
+
+SYSCTL_DECL(_vfs_nfs);
+
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RD, &nfs_realign_test, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RD, &nfs_realign_count, 0, "");
+
+
/*
* There is a congestion window for outstanding rpcs maintained per mount
* point. The cwnd size is adjusted in roughly the way that:
@@ -138,7 +148,7 @@ struct callout_handle nfs_timer_handle;
static int nfs_msg __P((struct proc *,char *,char *));
static int nfs_rcvlock __P((struct nfsreq *));
static void nfs_rcvunlock __P((struct nfsreq *));
-static void nfs_realign __P((struct mbuf *m, int hsiz));
+static void nfs_realign __P((struct mbuf **pm, int hsiz));
static int nfs_receive __P((struct nfsreq *rep, struct sockaddr **aname,
struct mbuf **mp));
static int nfs_reconnect __P((struct nfsreq *rep));
@@ -702,7 +712,7 @@ errout:
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
- nfs_realign(*mp, 5 * NFSX_UNSIGNED);
+ nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
@@ -1589,92 +1599,56 @@ nfs_rcvunlock(rep)
}
/*
- * Check for badly aligned mbuf data areas and
- * realign data in an mbuf list by copying the data areas up, as required.
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
+ *
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
+ *
+ * We would prefer to avoid this situation entirely. The situation does
+ * not occur with NFS/UDP and is supposed to only occassionally occur
+ * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
*/
static void
-nfs_realign(m, hsiz)
- register struct mbuf *m;
+nfs_realign(pm, hsiz)
+ register struct mbuf **pm;
int hsiz;
{
- register struct mbuf *m2;
- register int siz, mlen, olen;
- register caddr_t tcp, fcp;
- struct mbuf *mnew;
+ struct mbuf *m;
+ struct mbuf *n = NULL;
+ int off = 0;
- while (m) {
- /*
- * This never happens for UDP, rarely happens for TCP
- * but frequently happens for iso transport.
- */
- if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
- if ((intptr_t)fcp & 0x3) {
- m->m_flags &= ~M_PKTHDR;
- if (m->m_flags & M_EXT)
- m->m_data = m->m_ext.ext_buf +
- ((m->m_ext.ext_size - olen) & ~0x3);
- else
- m->m_data = m->m_dat;
- }
- m->m_len = 0;
- tcp = mtod(m, caddr_t);
- mnew = m;
- m2 = m->m_next;
+ ++nfs_realign_test;
- /*
- * If possible, only put the first invariant part
- * of the RPC header in the first mbuf.
- */
- mlen = M_TRAILINGSPACE(m);
- if (olen <= hsiz && mlen > hsiz)
- mlen = hsiz;
-
- /*
- * Loop through the mbuf list consolidating data.
- */
- while (m) {
- while (olen > 0) {
- if (mlen == 0) {
- m2->m_flags &= ~M_PKTHDR;
- if (m2->m_flags & M_EXT)
- m2->m_data = m2->m_ext.ext_buf;
- else
- m2->m_data = m2->m_dat;
- m2->m_len = 0;
- mlen = M_TRAILINGSPACE(m2);
- tcp = mtod(m2, caddr_t);
- mnew = m2;
- m2 = m2->m_next;
- }
- siz = min(mlen, olen);
- if (tcp != fcp)
- bcopy(fcp, tcp, siz);
- mnew->m_len += siz;
- mlen -= siz;
- olen -= siz;
- tcp += siz;
- fcp += siz;
- }
- m = m->m_next;
- if (m) {
- olen = m->m_len;
- fcp = mtod(m, caddr_t);
+ while ((m = *pm) != NULL) {
+ if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
+ MGET(n, M_WAIT, MT_DATA);
+ if (m->m_len >= MINCLSIZE) {
+ MCLGET(n, M_WAIT);
}
+ n->m_len = 0;
+ break;
}
+ pm = &m->m_next;
+ }
- /*
- * Finally, set m_len == 0 for any trailing mbufs that have
- * been copied out of.
- */
- while (m2) {
- m2->m_len = 0;
- m2 = m2->m_next;
+ /*
+ * If n is non-NULL, loop on m copying data, then replace the
+ * portion of the chain that had to be realigned.
+ */
+ if (n != NULL) {
+ ++nfs_realign_count;
+ while (m) {
+ m_copyback(n, off, m->m_len, mtod(m, caddr_t));
+ off += m->m_len;
+ m = m->m_next;
}
- return;
- }
- m = m->m_next;
+ m_freem(*pm);
+ *pm = n;
}
}
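
The copy-don't-shift rule above can be mimicked with flat buffers. A user-space sketch (malloc stands in for MGET/MCLGET; the 4-byte alignment test is kept verbatim):

    /*
     * Data whose length or starting address is not 4-byte aligned is
     * copied into freshly allocated, aligned storage instead of being
     * shifted in place, which could clobber neighbouring rpcs sharing
     * the same underlying storage.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void *
    realign(const void *data, size_t len)
    {
        void *n;

        if ((len & 0x3) == 0 && ((intptr_t)data & 0x3) == 0)
            return (NULL);          /* already aligned: leave in place */
        n = malloc((len + 3) & ~(size_t)3);     /* fresh, aligned copy */
        if (n != NULL)
            memcpy(n, data, len);
        return (n);
    }

    int
    main(void)
    {
        char backing[64];
        char *p = backing + 1;      /* deliberately misaligned */
        void *n;

        memset(backing, 0xaa, sizeof(backing));
        n = realign(p, 8);
        printf("copied: %s\n", n != NULL ? "yes" : "no");
        free(n);
        return (0);
    }
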
@@ -2040,7 +2014,7 @@ nfsrv_rcv(so, arg, waitflag)
m_freem(mp);
continue;
}
- nfs_realign(mp, 10 * NFSX_UNSIGNED);
+ nfs_realign(&mp, 10 * NFSX_UNSIGNED);
rec->nr_address = nam;
rec->nr_packet = mp;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
@@ -2182,7 +2156,7 @@ nfsrv_getstream(slp, waitflag)
if (!rec) {
m_freem(slp->ns_frag);
} else {
- nfs_realign(slp->ns_frag, 10 * NFSX_UNSIGNED);
+ nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
rec->nr_address = (struct sockaddr *)0;
rec->nr_packet = slp->ns_frag;
STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index a92bb22..6114d56 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.123 1999/02/16 10:49:54 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.124 1999/03/12 02:24:58 julian Exp $
*/
@@ -408,9 +408,9 @@ nfs_access(ap)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
- bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
+ bp = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
- aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
+ aiov.iov_len = auio.uio_resid = DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
@@ -962,7 +962,7 @@ nfs_read(ap)
if (vp->v_type != VREG)
return (EPERM);
- return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
/*
@@ -980,7 +980,7 @@ nfs_readlink(ap)
if (vp->v_type != VLNK)
return (EINVAL);
- return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred, 0));
+ return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
@@ -1985,7 +1985,7 @@ nfs_readdir(ap)
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
- error = nfs_bioread(vp, uio, 0, ap->a_cred, 0);
+ error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid)
nfsstats.direofcache_misses++;
@@ -2004,7 +2004,7 @@ nfs_readdirrpc(vp, uiop, cred)
{
register int len, left;
- register struct dirent *dp;
+ register struct dirent *dp = NULL;
register u_int32_t *tl;
register caddr_t cp;
register int32_t t1, t2;
@@ -2019,12 +2019,9 @@ nfs_readdirrpc(vp, uiop, cred)
int attrflag;
int v3 = NFS_ISV3(vp);
-#ifndef nolint
- dp = (struct dirent *)0;
-#endif
#ifndef DIAGNOSTIC
- if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (NFS_DIRBLKSIZ - 1)) ||
- (uiop->uio_resid & (NFS_DIRBLKSIZ - 1)))
+ if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
+ (uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
@@ -2381,7 +2378,7 @@ nfs_readdirplusrpc(vp, uiop, cred)
m_freem(mrep);
}
/*
- * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ
+ * Fill last record, iff any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
@@ -3028,13 +3025,13 @@ nfs_bwrite(ap)
struct vnode *a_bp;
} */ *ap;
{
-
return (nfs_writebp(ap->a_bp, 1));
}
/*
* This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.
+ * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
+ * B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(bp, force)
@@ -3049,12 +3046,15 @@ nfs_writebp(bp, force)
if(!(bp->b_flags & B_BUSY))
panic("bwrite: buffer is not busy???");
- if (bp->b_flags & B_INVAL)
- bp->b_flags |= B_NOCACHE;
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return(0);
+ }
+
+ bp->b_flags |= B_CACHE;
/*
- * XXX we bundirty() the bp here. Shouldn't we do it later after
- * the I/O has completed??
+ * Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index d2ce212..2e88ca7 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
+ * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
+ *
+ * NOTES:
+ * b_bufsize, b_bcount. b_bufsize is the allocation size of the
+ * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
+ * originally requested buffer size and can serve as a bounds check
+ * against EOF. For most, but not all uses, b_bcount == b_bufsize.
+ *
+ * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
+ * ranges of dirty data that need to be written to backing store.
+ * The range is typically clipped at b_bcount ( not b_bufsize ).
+ *
+ * b_resid. Number of bytes remaining in I/O. After an I/O operation
+ * completes, b_resid is usually 0 indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
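
The b_dirtyoff/b_dirtyend note above describes the one NFS-specific piece left in the VFS/BIO layer: a sub-buffer byte range of dirty data, clipped at b_bcount rather than b_bufsize. A tiny sketch of that clipping rule (a hypothetical helper written for illustration, not code from this commit):

    /*
     * Clip a buffer's dirty range at b_bcount, per the note above: the
     * range may be unaligned but never extends past the bytes actually
     * requested.  Hypothetical helper, not part of the commit.
     */
    static void
    clip_dirty_range(struct buf *bp)
    {
            if (bp->b_dirtyend > bp->b_bcount)
                    bp->b_dirtyend = bp->b_bcount;
            if (bp->b_dirtyoff >= bp->b_dirtyend)
                    bp->b_dirtyoff = bp->b_dirtyend = 0;    /* nothing dirty */
    }
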
@@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
+#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
+#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
- * The situation where B_DELWRI is set and B_CACHE gets
- * cleared MUST be committed to disk so B_DELWRI can
- * also be cleared.
+ * The situation where B_DELWRI is set and B_CACHE is
+ * clear MUST be committed to disk by getblk() so
+ * B_DELWRI can also be cleared. See the comments for
+ * getblk() in kern/vfs_bio.c. If B_CACHE is clear,
+ * the caller is expected to clear B_ERROR|B_INVAL,
+ * set B_READ, and initiate an I/O.
+ *
+ * The 'entire buffer' is defined to be the range from
+ * 0 through b_bcount.
+ *
+ * B_MALLOC Request that the buffer be allocated from the malloc
+ * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
+ *
+ * B_VMIO Indicates that the buffer is tied into a VM object.
+ * The buffer's data is always PAGE_SIZE aligned even
+ * if b_bufsize and b_bcount are not. ( b_bufsize is
+ * always at least DEV_BSIZE aligned, though ).
+ *
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
@@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
+void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
+
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
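
The B_CACHE comment above codifies the contract that the ffs_inode.c and ufs_bmap.c hunks further down follow: when getblk() returns a buffer without B_CACHE, the caller clears B_ERROR|B_INVAL, sets B_READ, and issues the read itself; biodone() then sets B_CACHE on success. A fragment sketch of that caller pattern, assuming the buffer KPI used elsewhere in this diff (not a new routine from the commit):

    struct buf *bp;
    int error = 0;

    bp = getblk(vp, lbn, size, 0, 0);
    if ((bp->b_flags & B_CACHE) == 0) {
            /* Contents not valid: start a read per the new contract. */
            bp->b_flags |= B_READ;
            bp->b_flags &= ~(B_ERROR | B_INVAL);
            vfs_busy_pages(bp, 0);
            VOP_STRATEGY(bp->b_vp, bp);
            error = biowait(bp);    /* biodone() sets B_CACHE on success */
    }
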
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index d2ce212..2e88ca7 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.64 1999/03/02 04:04:28 mckusick Exp $
+ * $Id: buf.h,v 1.65 1999/03/12 02:24:55 julian Exp $
*/
#ifndef _SYS_BUF_H_
@@ -78,6 +78,19 @@ struct iodone_chain {
/*
* The buffer header describes an I/O operation in the kernel.
+ *
+ * NOTES:
+ * b_bufsize, b_bcount. b_bufsize is the allocation size of the
+ * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the
+ * originally requested buffer size and can serve as a bounds check
+ * against EOF. For most, but not all uses, b_bcount == b_bufsize.
+ *
+ * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned
+ * ranges of dirty data that need to be written to backing store.
+ * The range is typically clipped at b_bcount ( not b_bufsize ).
+ *
+ * b_resid. Number of bytes remaining in I/O. After an I/O operation
+ * completes, b_resid is usually 0 indicating 100% success.
*/
struct buf {
LIST_ENTRY(buf) b_hash; /* Hash chain. */
@@ -109,8 +122,10 @@ struct buf {
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
+#if 0
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
+#endif
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
@@ -151,9 +166,24 @@ struct buf {
* Buffer vp reassignments are illegal in this case.
*
* B_CACHE This may only be set if the buffer is entirely valid.
- * The situation where B_DELWRI is set and B_CACHE gets
- * cleared MUST be committed to disk so B_DELWRI can
- * also be cleared.
+ * The situation where B_DELWRI is set and B_CACHE is
+ * clear MUST be committed to disk by getblk() so
+ * B_DELWRI can also be cleared. See the comments for
+ * getblk() in kern/vfs_bio.c. If B_CACHE is clear,
+ * the caller is expected to clear B_ERROR|B_INVAL,
+ * set B_READ, and initiate an I/O.
+ *
+ * The 'entire buffer' is defined to be the range from
+ * 0 through b_bcount.
+ *
+ * B_MALLOC Request that the buffer be allocated from the malloc
+ * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
+ *
+ * B_VMIO Indicates that the buffer is tied into a VM object.
+ * The buffer's data is always PAGE_SIZE aligned even
+ * if b_bufsize and b_bcount are not. ( b_bufsize is
+ * always at least DEV_BSIZE aligned, though ).
+ *
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
@@ -356,6 +386,7 @@ void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,
int, u_int (*)(struct buf *), struct uio *));
u_int minphys __P((struct buf *));
+void vfs_bio_set_validclean __P((struct buf *, int base, int size));
void vfs_bio_clrbuf __P((struct buf *));
void vfs_busy_pages __P((struct buf *, int clear_modify));
void vfs_unbusy_pages __P((struct buf *));
@@ -371,6 +402,7 @@ int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
void pbreassignbuf __P((struct buf *, struct vnode *));
struct buf *trypbuf __P((int *));
+
#endif /* KERNEL */
#endif /* !_SYS_BUF_H_ */
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 8821440..c80d0a5 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
- * $Id: ffs_inode.c,v 1.52 1999/01/07 16:14:16 bde Exp $
+ * $Id: ffs_inode.c,v 1.53 1999/01/28 00:57:54 dillon Exp $
*/
#include "opt_quota.h"
@@ -452,6 +452,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
if ((bp->b_flags & B_CACHE) == 0) {
curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_ERROR|B_INVAL);
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c
index d4d82f0..c9ae4dd 100644
--- a/sys/ufs/mfs/mfs_vnops.c
+++ b/sys/ufs/mfs/mfs_vnops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
- * $Id: mfs_vnops.c,v 1.42 1999/01/28 00:57:55 dillon Exp $
+ * $Id: mfs_vnops.c,v 1.43 1999/04/11 02:28:32 eivind Exp $
*/
#include <sys/param.h>
@@ -127,6 +127,9 @@ mfs_fsync(ap)
* We implement the B_FREEBUF strategy. We can't just madvise()
* here because we have to do it in the correct order vs other bio
* requests, so we queue it.
+ *
+ * Note: geteblk() sets B_INVAL. We leave it set to guarantee buffer
+ * throw-away on brelse()? XXX
*/
static int
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index f40ff33..3ea5965 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95
- * $Id: ufs_bmap.c,v 1.24 1998/10/27 11:47:08 bde Exp $
+ * $Id: ufs_bmap.c,v 1.25 1999/01/28 00:57:55 dillon Exp $
*/
#include <sys/param.h>
@@ -228,6 +228,7 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb)
#endif
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_INVAL|B_ERROR);
vfs_busy_pages(bp, 0);
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 047f10f..882d52e 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -66,7 +66,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_fault.c,v 1.100 1999/02/17 09:08:29 dillon Exp $
+ * $Id: vm_fault.c,v 1.101 1999/02/25 06:00:52 alc Exp $
*/
/*
@@ -409,6 +409,12 @@ readrest:
firstpindex = fs.first_pindex -
2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1);
+ /*
+ * note: partially valid pages cannot be
+ * included in the lookahead - NFS piecemeal
+ * writes will barf on it badly.
+ */
+
for(tmppindex = fs.first_pindex - 1;
tmppindex >= firstpindex;
--tmppindex) {
@@ -552,12 +558,16 @@ readrest:
}
fs.first_m = NULL;
+ /*
+ * Zero the page if necessary and mark it valid.
+ */
if ((fs.m->flags & PG_ZERO) == 0) {
vm_page_zero_fill(fs.m);
- }
- else
+ } else {
cnt.v_ozfod++;
+ }
cnt.v_zfod++;
+ fs.m->valid = VM_PAGE_BITS_ALL;
break; /* break to PAGE HAS BEEN FOUND */
} else {
if (fs.object != fs.first_object) {
@@ -788,14 +798,24 @@ readrest:
#endif
unlock_things(&fs);
- fs.m->valid = VM_PAGE_BITS_ALL;
- vm_page_flag_clear(fs.m, PG_ZERO);
+
+ /*
+ * Sanity check: page must be completely valid or it is not fit to
+ * map into user space. vm_pager_get_pages() ensures this.
+ */
+
+ if (fs.m->valid != VM_PAGE_BITS_ALL) {
+ vm_page_zero_invalid(fs.m, TRUE);
+ printf("Warning: page %p partially invalid on fault\n", fs.m);
+ }
pmap_enter(fs.map->pmap, vaddr, VM_PAGE_TO_PHYS(fs.m), prot, wired);
+
if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) {
pmap_prefault(fs.map->pmap, vaddr, fs.entry);
}
+ vm_page_flag_clear(fs.m, PG_ZERO);
vm_page_flag_set(fs.m, PG_MAPPED|PG_REFERENCED);
if (fault_flags & VM_FAULT_HOLD)
vm_page_hold(fs.m);
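
vm_fault() no longer blindly marks the faulted page fully valid; instead it checks the invariant and, if the pager left DEV_BSIZE chunks invalid, zeroes them before the page reaches a user pmap so stale data cannot leak. A self-contained userland model of that zeroing step (vm_page_zero_invalid() with setvalid=TRUE also marks the whole page valid afterwards; a 4096/512 PAGE_SIZE/DEV_BSIZE split is assumed here):

    #include <string.h>

    #define DEV_BSIZE       512
    #define PAGE_SIZE       4096
    #define NCHUNKS         (PAGE_SIZE / DEV_BSIZE) /* 8 valid bits per page */

    /*
     * Zero every DEV_BSIZE chunk whose valid bit is clear, modelling
     * vm_page_zero_invalid(); 'valid' is the page's validity bitmap.
     */
    static void
    zero_invalid_chunks(char *page, unsigned int valid)
    {
            int i;

            for (i = 0; i < NCHUNKS; i++) {
                    if ((valid & (1 << i)) == 0)
                            memset(page + i * DEV_BSIZE, 0, DEV_BSIZE);
            }
    }
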
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index e07ea63..0d85a94 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
- * $Id: vm_page.c,v 1.128 1999/03/19 05:21:03 alc Exp $
+ * $Id: vm_page.c,v 1.129 1999/04/05 19:38:29 julian Exp $
*/
/*
@@ -1460,14 +1460,16 @@ vm_page_bits(int base, int size)
}
/*
- * set a page valid and clean. May not block.
+ * vm_page_set_validclean:
*
- * In order to maintain consistancy due to the DEV_BSIZE granularity
- * of the valid bits, we have to zero non-DEV_BSIZE aligned portions of
- * the page at the beginning and end of the valid range when the
- * associated valid bits are not already set.
+ * Sets portions of a page valid and clean. The arguments are expected
+ * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
+ * of any partial chunks touched by the range. The invalid portion of
+ * such chunks will be zero'd.
*
- * (base + size) must be less then or equal to PAGE_SIZE.
+ * This routine may not block.
+ *
+ * (base + size) must be less than or equal to PAGE_SIZE.
*/
void
vm_page_set_validclean(m, base, size)
@@ -1529,8 +1531,35 @@ vm_page_set_validclean(m, base, size)
pmap_clear_modify(VM_PAGE_TO_PHYS(m));
}
+#if 0
+
+void
+vm_page_set_dirty(m, base, size)
+ vm_page_t m;
+ int base;
+ int size;
+{
+ m->dirty |= vm_page_bits(base, size);
+}
+
+#endif
+
+void
+vm_page_clear_dirty(m, base, size)
+ vm_page_t m;
+ int base;
+ int size;
+{
+ m->dirty &= ~vm_page_bits(base, size);
+}
+
/*
- * set a page (partially) invalid. May not block.
+ * vm_page_set_invalid:
+ *
+ * Invalidates DEV_BSIZE'd chunks within a page. Both the
+ * valid and dirty bits for the affected areas are cleared.
+ *
+ * May not block.
*/
void
vm_page_set_invalid(m, base, size)
@@ -1540,9 +1569,9 @@ vm_page_set_invalid(m, base, size)
{
int bits;
- m->valid &= ~(bits = vm_page_bits(base, size));
- if (m->valid == 0)
- m->dirty &= ~bits;
+ bits = vm_page_bits(base, size);
+ m->valid &= ~bits;
+ m->dirty &= ~bits;
m->object->generation++;
}
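
vm_page_clear_dirty() and the reworked vm_page_set_invalid() both reduce to masking the page's DEV_BSIZE-granular bitmaps with vm_page_bits(base, size), one bit per chunk touched by the byte range, partial chunks included. A standalone model consistent with the behavior the vm_page_set_validclean() comment describes (DEV_BSHIFT of 9, i.e. DEV_BSIZE 512, is assumed, and the real vm_page_bits() is not shown in this diff):

    #include <assert.h>

    #define DEV_BSHIFT      9       /* log2(DEV_BSIZE), DEV_BSIZE = 512 */

    /*
     * Model of vm_page_bits(): one bit per chunk touched by
     * [base, base + size).  Assumes size > 0.
     */
    static int
    page_bits(int base, int size)
    {
            int first_bit = base >> DEV_BSHIFT;
            int last_bit = (base + size - 1) >> DEV_BSHIFT;

            return ((2 << last_bit) - (1 << first_bit));
    }

    int
    main(void)
    {
            /* Bytes 100..1499 touch chunks 0 through 2, so the mask is 0x07. */
            assert(page_bits(100, 1400) == 0x07);
            return (0);
    }
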
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 8072f66..abff794 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_page.h,v 1.58 1999/03/15 05:09:48 julian Exp $
+ * $Id: vm_page.h,v 1.59 1999/04/05 19:38:29 julian Exp $
*/
/*
@@ -101,6 +101,10 @@
* Fields in this structure are locked either by the lock on the
* object that the page belongs to (O) or by the lock on the page
* queues (P).
+ *
+ * The 'valid' and 'dirty' fields are distinct. A page may have dirty
+ * bits set without having associated valid bits set. This is used by
+ * NFS to implement piecemeal writes.
*/
TAILQ_HEAD(pglist, vm_page);
@@ -404,6 +408,8 @@ void vm_page_wire __P((vm_page_t));
void vm_page_unqueue __P((vm_page_t));
void vm_page_unqueue_nowakeup __P((vm_page_t));
void vm_page_set_validclean __P((vm_page_t, int, int));
+void vm_page_set_dirty __P((vm_page_t, int, int));
+void vm_page_clear_dirty __P((vm_page_t, int, int));
void vm_page_set_invalid __P((vm_page_t, int, int));
static __inline boolean_t vm_page_zero_fill __P((vm_page_t));
int vm_page_is_valid __P((vm_page_t, int, int));
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
index 36a905e..dbacceb 100644
--- a/sys/vm/vm_pager.c
+++ b/sys/vm/vm_pager.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pager.c,v 1.44 1999/03/14 09:20:00 julian Exp $
+ * $Id: vm_pager.c,v 1.45 1999/04/11 02:16:27 eivind Exp $
*/
/*
@@ -523,6 +523,9 @@ vm_pager_chain_iodone(struct buf *nbp)
* Obtain a physical buffer and chain it to its parent buffer. When
* I/O completes, the parent buffer will be B_SIGNAL'd. Errors are
* automatically propogated to the parent
+ *
+ * Since these are brand new buffers, we do not have to clear B_INVAL
+ * and B_ERROR because they are already clear.
*/
struct buf *
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
index 82b6574..aff14ab 100644
--- a/sys/vm/vm_pager.h
+++ b/sys/vm/vm_pager.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
- * $Id: vm_pager.h,v 1.20 1999/01/24 02:32:15 dillon Exp $
+ * $Id: vm_pager.h,v 1.21 1999/03/14 09:20:00 julian Exp $
*/
/*
@@ -110,6 +110,14 @@ void flushchainbuf(struct buf *nbp);
void waitchainbuf(struct buf *bp, int count, int done);
void autochaindone(struct buf *bp);
+/*
+ * vm_pager_get_pages:
+ *
+ * Retrieve pages from the VM system in order to map them into an object
+ * ( or into VM space somewhere ). If the pagein was successful, we
+ * must fully validate it.
+ */
+
static __inline int
vm_pager_get_pages(
vm_object_t object,
@@ -117,7 +125,13 @@ vm_pager_get_pages(
int count,
int reqpage
) {
- return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
+ int r;
+
+ r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
+ if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
+ vm_page_zero_invalid(m[reqpage], TRUE);
+ }
+ return(r);
}
static __inline void
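
With the wrapper above, validation is centralized: a successful vm_pager_get_pages() now guarantees the requested page is fully valid, which is exactly what the vm_fault.c hunk earlier relies on. A hedged fragment of what a caller may now assume (the marray/reqpage names are illustrative, not from the commit):

    vm_page_t m;

    if (vm_pager_get_pages(object, marray, count, reqpage) == VM_PAGER_OK) {
            m = marray[reqpage];
            /* The wrapper zeroed any invalid chunks and validated the page. */
            KASSERT(m->valid == VM_PAGE_BITS_ALL,
                ("pagein left page partially valid"));
    }
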
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 628bec7..83f379a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -38,7 +38,7 @@
* SUCH DAMAGE.
*
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
- * $Id: vnode_pager.c,v 1.106 1999/04/05 19:38:29 julian Exp $
+ * $Id: vnode_pager.c,v 1.107 1999/04/10 20:52:11 dt Exp $
*/
/*
@@ -789,7 +789,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
* read.
*/
vm_page_set_validclean(mt, 0, size - tfoff);
- vm_page_zero_invalid(mt, FALSE);
+ /* handled by vm_fault now */
+ /* vm_page_zero_invalid(mt, FALSE); */
}
vm_page_flag_clear(mt, PG_ZERO);