Diffstat:
-rw-r--r--  sys/fs/specfs/spec_vnops.c          2
-rw-r--r--  sys/kern/vfs_bio.c                263
-rw-r--r--  sys/kern/vfs_cluster.c             12
-rw-r--r--  sys/miscfs/specfs/spec_vnops.c      2
-rw-r--r--  sys/sys/buf.h                       2
-rw-r--r--  sys/sys/vnode.h                     1
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c         4
-rw-r--r--  sys/vm/vm_page.c                    2
-rw-r--r--  sys/vm/vm_page.h                    1
-rw-r--r--  sys/vm/vm_pageout.c               228
-rw-r--r--  sys/vm/vnode_pager.c               39
11 files changed, 304 insertions, 252 deletions
diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c
index 582bece..f3d7f11 100644
--- a/sys/fs/specfs/spec_vnops.c
+++ b/sys/fs/specfs/spec_vnops.c
@@ -684,6 +684,8 @@ spec_getpages(ap)
bp->b_bcount = size;
bp->b_bufsize = size;
bp->b_resid = 0;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += pcount;
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 21d447d..9949813 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -85,22 +85,24 @@ static void buf_daemon __P((void));
* but the code is intricate enough already.
*/
vm_page_t bogus_page;
-int runningbufspace;
int vmiodirenable = FALSE;
+int runningbufspace;
static vm_offset_t bogus_offset;
-static int bufspace, maxbufspace,
+static int bufspace, maxbufspace,
bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
static int bufreusecnt, bufdefragcnt, buffreekvacnt;
-static int maxbdrun;
static int needsbuffer;
-static int numdirtybuffers, hidirtybuffers;
+static int lorunningspace, hirunningspace, runningbufreq;
+static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
&numdirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
+ &lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
@@ -111,6 +113,10 @@ SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
&runningbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
+ &lorunningspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
+ &hirunningspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
@@ -119,8 +125,6 @@ SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
&lobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
- &maxbdrun, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
@@ -170,9 +174,9 @@ bufhash(struct vnode *vnp, daddr_t bn)
*/
static __inline void
-numdirtywakeup(void)
+numdirtywakeup(int level)
{
- if (numdirtybuffers < hidirtybuffers) {
+ if (numdirtybuffers <= level) {
if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
wakeup(&needsbuffer);
@@ -204,6 +208,23 @@ bufspacewakeup(void)
}
/*
+ * runningbufwakeup() - in-progress I/O accounting.
+ *
+ */
+static __inline void
+runningbufwakeup(struct buf *bp)
+{
+ if (bp->b_runningbufspace) {
+ runningbufspace -= bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ if (runningbufreq && runningbufspace <= lorunningspace) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ }
+}
+
+/*
* bufcountwakeup:
*
* Called when a buffer has been added to one of the free queues to
@@ -225,6 +246,31 @@ bufcountwakeup(void)
}
/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * Reads will adjust runningbufspace, but will not block based on it.
+ * The read load has a side effect of reducing the allowed write load.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+static __inline void
+waitrunningbufspace(void)
+{
+ while (runningbufspace > hirunningspace) {
+ ++runningbufreq;
+ tsleep(&runningbufreq, PVM, "wdrain", 0);
+ }
+}
+
+
+/*
* vfs_buf_test_cache:
*
* Called when a buffer is extended. This function clears the B_CACHE
@@ -248,7 +294,7 @@ static __inline__
void
bd_wakeup(int dirtybuflevel)
{
- if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
+ if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -330,6 +376,9 @@ bufinit(void)
hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
lobufspace = hibufspace - MAXBSIZE;
+ lorunningspace = 512 * 1024;
+ hirunningspace = 1024 * 1024;
+
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
@@ -354,6 +403,7 @@ bufinit(void)
while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
+ lodirtybuffers = hidirtybuffers / 2;
/*
* Try to keep the number of free buffers in the specified range,
@@ -370,8 +420,6 @@ bufinit(void)
* based on the number of bytes of I/O in-transit that were initiated
* from buf_daemon.
*/
- if ((maxbdrun = nswbuf / 4) < 4)
- maxbdrun = 4;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
bogus_page = vm_page_alloc(kernel_object,
@@ -419,7 +467,6 @@ bremfree(struct buf * bp)
KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
- runningbufspace += bp->b_bufsize;
} else {
if (BUF_REFCNT(bp) <= 1)
panic("bremfree: removing a buffer not on a queue");
@@ -659,6 +706,13 @@ bwrite(struct buf * bp)
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
+ } else {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. There is no chance of deadlock here because
+ * we are blocking on I/O that is already in-progress.
+ */
+ waitrunningbufspace();
}
return (0);
@@ -774,11 +828,11 @@ bdwrite(struct buf * bp)
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have saturated the
- * buffer cache.
+ * Wakeup the buffer flushing daemon if we have a lot of dirty
+ * buffers (midpoint between our recovery point and our stall
+ * point).
*/
-
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
/*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
@@ -817,7 +871,7 @@ bdirty(bp)
bp->b_flags |= B_DONE | B_DELWRI;
reassignbuf(bp, bp->b_vp);
++numdirtybuffers;
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
}
}
@@ -843,7 +897,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -896,14 +950,12 @@ bowrite(struct buf * bp)
void
bwillwrite(void)
{
- int slop = hidirtybuffers / 10;
-
- if (numdirtybuffers > hidirtybuffers + slop) {
+ if (numdirtybuffers >= hidirtybuffers) {
int s;
s = splbio();
- while (numdirtybuffers > hidirtybuffers) {
- bd_wakeup(hidirtybuffers);
+ while (numdirtybuffers >= hidirtybuffers) {
+ bd_wakeup(1);
needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
}
@@ -963,7 +1015,7 @@ brelse(struct buf * bp)
buf_deallocate(bp);
if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
@@ -1169,11 +1221,9 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
- runningbufspace -= bp->b_bufsize;
-
/*
* Fixup numfreebuffers count. The bp is on an appropriate queue
* unless locked. We then bump numfreebuffers if it is not B_DELWRI.
@@ -1248,8 +1298,6 @@ bqrelse(struct buf * bp)
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
}
- runningbufspace -= bp->b_bufsize;
-
if ((bp->b_flags & B_LOCKED) == 0 &&
((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
bufcountwakeup();
@@ -1309,13 +1357,13 @@ vfs_vmio_release(bp)
}
}
}
- runningbufspace -= bp->b_bufsize;
splx(s);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_npages = 0;
- bp->b_bufsize = 0;
bp->b_flags &= ~B_VMIO;
if (bp->b_vp)
brelvp(bp);
@@ -1723,27 +1771,6 @@ restart:
return(bp);
}
-#if 0
-/*
- * waitfreebuffers:
- *
- * Wait for sufficient free buffers. Only called from normal processes.
- */
-
-static void
-waitfreebuffers(int slpflag, int slptimeo)
-{
- while (numfreebuffers < hifreebuffers) {
- if (numfreebuffers >= hifreebuffers)
- break;
- needsbuffer |= VFS_BIO_NEED_FREE;
- if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
- break;
- }
-}
-
-#endif
-
/*
* buf_daemon:
*
@@ -1753,9 +1780,6 @@ waitfreebuffers(int slpflag, int slptimeo)
*/
static struct proc *bufdaemonproc;
-static int bd_interval;
-static int bd_flushto;
-static int bd_flushinc;
static struct kproc_desc buf_kp = {
"bufdaemon",
@@ -1783,65 +1807,50 @@ buf_daemon()
curproc->p_flag |= P_BUFEXHAUST;
s = splbio();
- bd_interval = 5 * hz; /* dynamically adjusted */
- bd_flushto = hidirtybuffers; /* dynamically adjusted */
- bd_flushinc = 1;
-
for (;;) {
kthread_suspend_check(bufdaemonproc);
bd_request = 0;
/*
- * Do the flush. Limit the number of buffers we flush in one
- * go. The failure condition occurs when processes are writing
- * buffers faster then we can dispose of them. In this case
- * we may be flushing so often that the previous set of flushes
- * have not had time to complete, causing us to run out of
- * physical buffers and block.
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system. Wakeup any waiting processes before we
+ * normally would so they can run in parallel with our drain.
*/
- {
- int runcount = maxbdrun;
-
- while (numdirtybuffers > bd_flushto && runcount) {
- --runcount;
- if (flushbufqueues() == 0)
- break;
- }
+ while (numdirtybuffers > lodirtybuffers) {
+ if (flushbufqueues() == 0)
+ break;
+ waitrunningbufspace();
+ numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
}
- if (bd_request ||
- tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) {
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 5 seconds and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep half a second.
+ * Otherwise we loop immediately.
+ */
+ if (numdirtybuffers <= lodirtybuffers) {
/*
- * Another request is pending or we were woken up
- * without timing out. Flush more.
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
*/
- --bd_flushto;
- if (bd_flushto >= numdirtybuffers - 5) {
- bd_flushto = numdirtybuffers - 10;
- bd_flushinc = 1;
- }
- if (bd_flushto < 2)
- bd_flushto = 2;
+ bd_request = 0;
+ tsleep(&bd_request, PVM, "psleep", hz);
} else {
/*
- * We slept and timed out, we can slow down.
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
*/
- bd_flushto += bd_flushinc;
- if (bd_flushto > hidirtybuffers)
- bd_flushto = hidirtybuffers;
- ++bd_flushinc;
- if (bd_flushinc > hidirtybuffers / 20 + 1)
- bd_flushinc = hidirtybuffers / 20 + 1;
+ tsleep(&bd_request, PVM, "qsleep", hz / 2);
}
-
- /*
- * Set the interval on a linear scale based on hidirtybuffers
- * with a maximum frequency of 1/10 second.
- */
- bd_interval = bd_flushto * 5 * hz / hidirtybuffers;
- if (bd_interval < hz / 10)
- bd_interval = hz / 10;
}
}
@@ -2097,21 +2106,11 @@ loop:
*
* XXX remove if 0 sections (clean this up after its proven)
*/
-#if 0
- if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
-#endif
- if (numfreebuffers == 0) {
- if (curproc == idleproc)
- return NULL;
- needsbuffer |= VFS_BIO_NEED_ANY;
- tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
- slptimeo);
- }
-#if 0
- } else if (numfreebuffers < lofreebuffers) {
- waitfreebuffers(slpflag, slptimeo);
+ if (numfreebuffers == 0) {
+ if (curproc == idleproc)
+ return NULL;
+ needsbuffer |= VFS_BIO_NEED_ANY;
}
-#endif
if ((bp = gbincore(vp, blkno))) {
/*
@@ -2357,12 +2356,12 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
} else {
free(bp->b_data, M_BIOBUF);
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_data = bp->b_kvabase;
- bp->b_bufsize = 0;
bp->b_bcount = 0;
bp->b_flags &= ~B_MALLOC;
}
@@ -2389,7 +2388,6 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
bp->b_flags |= B_MALLOC;
bufmallocspace += mbsize;
- runningbufspace += bp->b_bufsize;
return 1;
}
#endif
@@ -2404,11 +2402,11 @@ allocbuf(struct buf *bp, int size)
origbuf = bp->b_data;
origbufsize = bp->b_bufsize;
bp->b_data = bp->b_kvabase;
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
- bp->b_bufsize = 0;
+ bp->b_bufsize = 0;
+ }
bp->b_flags &= ~B_MALLOC;
newbsize = round_page(newbsize);
}
@@ -2601,7 +2599,6 @@ allocbuf(struct buf *bp, int size)
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
- runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
bp->b_bufsize = newbsize; /* actual buffer allocation */
@@ -2681,6 +2678,7 @@ bufdone(struct buf *bp)
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
bp->b_flags |= B_DONE;
+ runningbufwakeup(bp);
if (bp->b_iocmd == BIO_DELETE) {
brelse(bp);
@@ -2768,18 +2766,8 @@ bufdone(struct buf *bp)
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
- if (!m) {
+ if (m == NULL)
panic("biodone: page disappeared!");
-#if defined(VFS_BIO_DEBUG)
- printf("biodone: page disappeared\n");
-#endif
- vm_object_pip_subtract(obj, 1);
- bp->b_flags &= ~B_CACHE;
- foff = (foff + PAGE_SIZE) &
- ~(off_t)PAGE_MASK;
- iosize -= resid;
- continue;
- }
bp->b_pages[i] = m;
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
@@ -2833,6 +2821,7 @@ bufdone(struct buf *bp)
if (obj)
vm_object_pip_wakeupn(obj, 0);
}
+
/*
* For asynchronous completions, release the buffer now. The brelse
* will do a wakeup there if necessary - so no need to do a wakeup
@@ -2860,6 +2849,7 @@ vfs_unbusy_pages(struct buf * bp)
{
int i;
+ runningbufwakeup(bp);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
@@ -2939,6 +2929,9 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
{
int i, bogus;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
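
The vfs_bio.c changes above boil down to a write-pipelining hysteresis: vfs_busy_pages() (and the pager/cluster paths) charge b_bufsize to runningbufspace when I/O is started, bufdone()/vfs_unbusy_pages() credit it back through runningbufwakeup(), and the async branch of bwrite() stalls in waitrunningbufspace() while more than hirunningspace bytes are in flight, getting woken once the total drains back to lorunningspace. The following is a minimal userland sketch of that hysteresis only, using pthreads in place of tsleep()/wakeup(), dropping the runningbufreq optimization, and with invented names (io_start, io_done, wait_running); it is illustrative, not kernel code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static long runningbufspace;                    /* bytes of I/O in flight */
static const long lorunningspace = 512 * 1024;  /* wake writers below this */
static const long hirunningspace = 1024 * 1024; /* stall writers above this */

/* vfs_busy_pages() analogue: charge the buffer when its I/O is started. */
static void
io_start(long size)
{
        pthread_mutex_lock(&lk);
        runningbufspace += size;
        pthread_mutex_unlock(&lk);
}

/* runningbufwakeup() analogue: credit on completion and wake stalled
 * writers once we have drained to the low water mark. */
static void
io_done(long size)
{
        pthread_mutex_lock(&lk);
        runningbufspace -= size;
        if (runningbufspace <= lorunningspace)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lk);
}

/* waitrunningbufspace() analogue: async writers block here, so a burst of
 * bawrite()s cannot queue an unbounded amount of I/O ahead of the disk. */
static void
wait_running(void)
{
        pthread_mutex_lock(&lk);
        while (runningbufspace > hirunningspace)
                pthread_cond_wait(&drained, &lk);
        pthread_mutex_unlock(&lk);
}

int
main(void)
{
        io_start(256 * 1024);
        wait_running();         /* returns at once: still under hirunningspace */
        io_done(256 * 1024);
        printf("in flight: %ld bytes\n", runningbufspace);
        return (0);
}
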
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 29a1879..088dc40 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -247,8 +247,12 @@ single_block_read:
printf("S(%ld,%ld,%d) ",
(long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
- if ((bp->b_flags & B_CLUSTER) == 0)
+ if ((bp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(bp, 0);
+ } else {
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+ }
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
@@ -283,8 +287,12 @@ single_block_read:
}
#endif
- if ((rbp->b_flags & B_CLUSTER) == 0)
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(rbp, 0);
+ } else {
+ rbp->b_runningbufspace = rbp->b_bufsize;
+ runningbufspace += rbp->b_runningbufspace;
+ }
rbp->b_flags &= ~B_INVAL;
rbp->b_ioflags &= ~BIO_ERROR;
if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index 582bece..f3d7f11 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -684,6 +684,8 @@ spec_getpages(ap)
bp->b_bcount = size;
bp->b_bufsize = size;
bp->b_resid = 0;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += pcount;
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index a10083f..223c036 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -110,6 +110,7 @@ struct buf {
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
long b_bufsize; /* Allocated buffer size. */
+ long b_runningbufspace; /* when I/O is running, pipelining */
caddr_t b_kvabase; /* base kva for buffer */
int b_kvasize; /* size of kva for buffer */
daddr_t b_lblkno; /* Logical block number. */
@@ -480,6 +481,7 @@ buf_countdeps(struct buf *bp, int i)
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
+extern int runningbufspace;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 75462f6..2ab6f3f 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -213,6 +213,7 @@ struct vattr {
#define IO_NDELAY 0x10 /* FNDELAY flag set in file table */
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
+#define IO_ASYNC 0x80 /* bawrite rather than bdwrite */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index e1d775c..62ec9e3 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -504,7 +504,9 @@ WRITE(ap)
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
- } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else if (xfersize + blkoffset == fs->fs_bsize) {
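
Combined with the IO_ASYNC flag added to vnode.h above and set by the pager in vnode_pager.c below, this hunk changes which write primitive the UFS write path ends up using. A compressed sketch of the resulting decision ladder follows; choose_write(), the enum, and the severe_pressure/full_block parameters are invented stand-ins (the real code calls vm_page_count_severe()/buf_dirty_count_severe() and compares against fs_bsize), and IO_SYNC's value is taken from sys/vnode.h of this vintage.

#include <stdio.h>

#define IO_SYNC         0x04    /* do I/O synchronously */
#define IO_ASYNC        0x80    /* bawrite rather than bdwrite */

enum writeop { W_BWRITE, W_BAWRITE, W_CLUSTER, W_BDWRITE };

/*
 * Mirror of the ladder at the end of the UFS WRITE() routine: synchronous
 * requests are written and waited for, pressure-driven or IO_ASYNC requests
 * are started immediately without waiting, full blocks are handed to the
 * clustering code, and everything else is simply marked delayed-write.
 */
static enum writeop
choose_write(int ioflag, int severe_pressure, int full_block)
{
        if (ioflag & IO_SYNC)
                return (W_BWRITE);      /* bwrite(): write now, wait for it */
        if (severe_pressure || (ioflag & IO_ASYNC))
                return (W_BAWRITE);     /* bawrite(): write now, don't wait */
        if (full_block)
                return (W_CLUSTER);     /* cluster_write() a full block */
        return (W_BDWRITE);             /* bdwrite(): just mark it dirty */
}

int
main(void)
{
        /* a pageout-initiated write: the vnode pager now passes IO_ASYNC */
        printf("%s\n", choose_write(IO_ASYNC, 0, 1) == W_BAWRITE ?
            "bawrite" : "other");
        return (0);
}
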
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 1b2db6e..7cbe750 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1273,6 +1273,7 @@ vm_page_unwire(m, activate)
vm_page_queues[PQ_ACTIVE].lcnt++;
cnt.v_active_count++;
} else {
+ vm_page_flag_clear(m, PG_WINATCFLS);
TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
m->queue = PQ_INACTIVE;
vm_page_queues[PQ_INACTIVE].lcnt++;
@@ -1311,6 +1312,7 @@ _vm_page_deactivate(vm_page_t m, int athead)
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if ((m->queue - m->pc) == PQ_CACHE)
cnt.v_reactivated++;
+ vm_page_flag_clear(m, PG_WINATCFLS);
vm_page_unqueue(m);
if (athead)
TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 4c31df9..dc8290e 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -242,6 +242,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
*/
#define PG_BUSY 0x0001 /* page is in transit (O) */
#define PG_WANTED 0x0002 /* someone is waiting for page (O) */
+#define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */
#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */
#define PG_WRITEABLE 0x0010 /* page is mapped writeable */
#define PG_MAPPED 0x0020 /* page is mapped */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index dbea3d6..943fb11 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -106,7 +106,7 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout __P((void));
static int vm_pageout_clean __P((vm_page_t));
-static int vm_pageout_scan __P((void));
+static void vm_pageout_scan __P((int pass));
static int vm_pageout_free_page_calc __P((vm_size_t count));
struct proc *pageproc;
@@ -140,14 +140,13 @@ static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
#endif
extern int vm_swap_size;
+static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
-static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0;
+static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
-static int max_page_launder=100;
-static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
@@ -157,7 +156,10 @@ static int vm_swap_idle_enabled=0;
#endif
SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
- CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt");
+ CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
+
+SYSCTL_INT(_vm, OID_AUTO, max_launder,
+ CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
@@ -189,12 +191,6 @@ SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
-SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
- CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
-SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
- CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
-
-
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
@@ -509,7 +505,7 @@ vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
} else if (p->queue == PQ_ACTIVE) {
if ((p->flags & PG_REFERENCED) == 0) {
p->act_count -= min(p->act_count, ACT_DECLINE);
- if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) {
+ if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
vm_page_protect(p, VM_PROT_NONE);
vm_page_deactivate(p);
} else {
@@ -627,20 +623,21 @@ vm_pageout_page_free(vm_page_t m) {
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
-static int
-vm_pageout_scan()
+static void
+vm_pageout_scan(int pass)
{
vm_page_t m, next;
struct vm_page marker;
+ int save_page_shortage;
+ int save_inactive_count;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
- int maxlaunder;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
- int force_wakeup = 0;
int actcount;
int vnodes_skipped = 0;
+ int maxlaunder;
int s;
/*
@@ -651,27 +648,13 @@ vm_pageout_scan()
addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
- if (max_page_launder == 0)
- max_page_launder = 1;
-
/*
* Calculate the number of pages we want to either free or move
- * to the cache. Be more agressive if we aren't making our target.
+ * to the cache.
*/
-
- page_shortage = vm_paging_target() +
- addl_page_shortage_init + vm_pageout_actcmp;
-
- /*
- * Figure out how agressively we should flush dirty pages.
- */
- {
- int factor = vm_pageout_actcmp;
-
- maxlaunder = cnt.v_inactive_target / 3 + factor;
- if (maxlaunder > max_page_launder + factor)
- maxlaunder = max_page_launder + factor;
- }
+ page_shortage = vm_paging_target() + addl_page_shortage_init;
+ save_page_shortage = page_shortage;
+ save_inactive_count = cnt.v_inactive_count;
/*
* Initialize our marker
@@ -687,8 +670,22 @@ vm_pageout_scan()
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
+ *
+ * maxlaunder limits the number of dirty pages we flush per scan.
+ * For most systems a smaller value (16 or 32) is more robust under
+ * extreme memory and disk pressure because any unnecessary writes
+ * to disk can result in extreme performance degradation. However,
+ * systems with excessive dirty pages (especially when MAP_NOSYNC is
+ * used) will die horribly with limited laundering. If the pageout
+ * daemon cannot clean enough pages in the first pass, we let it go
+ * all out in succeeding passes.
*/
+ if ((maxlaunder = vm_max_launder) <= 1)
+ maxlaunder = 1;
+ if (pass)
+ maxlaunder = 10000;
+
rescan0:
addl_page_shortage = addl_page_shortage_init;
maxscan = cnt.v_inactive_count;
@@ -792,12 +789,32 @@ rescan0:
} else if (m->dirty == 0) {
vm_page_cache(m);
--page_shortage;
-
- /*
- * Dirty pages need to be paged out. Note that we clean
- * only a limited number of pages per pagedaemon pass.
- */
+ } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
+ /*
+ * Dirty pages need to be paged out, but flushing
+ * a page is extremely expensive versus freeing
+ * a clean page. Rather than artificially limiting
+ * the number of pages we can flush, we instead give
+ * dirty pages extra priority on the inactive queue
+ * by forcing them to be cycled through the queue
+ * twice before being flushed, after which the
+ * (now clean) page will cycle through once more
+ * before being freed. This significantly extends
+ * the thrash point for a heavily loaded machine.
+ */
+ s = splvm();
+ vm_page_flag_set(m, PG_WINATCFLS);
+ TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+ TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+ splx(s);
} else if (maxlaunder > 0) {
+ /*
+ * We always want to try to flush some dirty pages if
+ * we encounter them, to keep the system stable.
+ * Normally this number is small, but under extreme
+ * pressure where there are insufficient clean pages
+ * on the inactive queue, we may have to go all out.
+ */
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
@@ -826,29 +843,24 @@ rescan0:
}
/*
- * Presumably we have sufficient free memory to do
- * the more sophisticated checks and locking required
- * for vnodes.
- *
- * The object is already known NOT to be dead. The
- * vget() may still block, though, because
- * VOP_ISLOCKED() doesn't check to see if an inode
- * (v_data) is associated with the vnode. If it isn't,
- * vget() will load in it from disk. Worse, vget()
- * may actually get stuck waiting on "inode" if another
- * process is in the process of bringing the inode in.
- * This is bad news for us either way.
+ * The object is already known NOT to be dead. It
+ * is possible for the vget() to block the whole
+ * pageout daemon, but the new low-memory handling
+ * code should prevent it.
*
- * So for the moment we check v_data == NULL as a
- * workaround. This means that vnodes which do not
- * use v_data in the way we expect probably will not
- * wind up being paged out by the pager and it will be
- * up to the syncer to get them. That's better then
- * us blocking here.
+ * The previous code skipped locked vnodes and, worse,
+ * reordered pages in the queue. This results in
+ * completely non-deterministic operation and, on a
+ * busy system, can lead to extremely non-optimal
+ * pageouts. For example, it can cause clean pages
+ * to be freed and dirty pages to be moved to the end
+ * of the queue. Since dirty pages are also moved to
+ * the end of the queue once-cleaned, this gives
+ * way too large a weighting to deferring the freeing
+ * of dirty pages.
*
- * This whole code section is bogus - we need to fix
- * the vnode pager to handle vm_page_t's without us
- * having to do any sophisticated VOP tests.
+ * XXX we need to be able to apply a timeout to the
+ * vget() lock attempt.
*/
if (object->type == OBJT_VNODE) {
@@ -857,19 +869,8 @@ rescan0:
mp = NULL;
if (vp->v_type == VREG)
vn_start_write(vp, &mp, V_NOWAIT);
- if (VOP_ISLOCKED(vp, NULL) ||
- vp->v_data == NULL ||
- vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
+ if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
vn_finished_write(mp);
- if ((m->queue == PQ_INACTIVE) &&
- (m->hold_count == 0) &&
- (m->busy == 0) &&
- (m->flags & PG_BUSY) == 0) {
- s = splvm();
- TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- splx(s);
- }
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
continue;
@@ -924,18 +925,23 @@ rescan0:
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
- * start the cleaning operation. maxlaunder nominally
- * counts I/O cost (seeks) rather then bytes.
+ * start the cleaning operation.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
+ *
+ * decrement page_shortage on success to account for
+ * the (future) cleaned page. Otherwise we could wind
+ * up laundering or cleaning too many pages.
*/
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
- if (vm_pageout_clean(m) != 0)
+ if (vm_pageout_clean(m) != 0) {
+ --page_shortage;
--maxlaunder;
+ }
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
@@ -948,28 +954,12 @@ rescan0:
}
/*
- * If we were not able to meet our target, increase actcmp
- */
-
- if (vm_page_count_min()) {
- if (vm_pageout_actcmp < ACT_MAX / 2)
- vm_pageout_actcmp += ACT_ADVANCE;
- } else {
- if (vm_pageout_actcmp < ACT_DECLINE)
- vm_pageout_actcmp = 0;
- else
- vm_pageout_actcmp -= ACT_DECLINE;
- }
-
- /*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
-
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
- page_shortage += vm_pageout_actcmp;
/*
* Scan the active queue for things we can deactivate. We nominally
@@ -1043,9 +1033,9 @@ rescan0:
splx(s);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
- if (vm_pageout_algorithm_lru ||
- (m->object->ref_count == 0) ||
- (m->act_count <= vm_pageout_actcmp)) {
+ if (vm_pageout_algorithm ||
+ m->object->ref_count == 0 ||
+ m->act_count == 0) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
@@ -1175,7 +1165,6 @@ rescan0:
wakeup(&cnt.v_free_count);
}
}
- return force_wakeup;
}
/*
@@ -1254,11 +1243,13 @@ vm_pageout_page_stats()
} else {
if (m->act_count == 0) {
/*
- * We turn off page access, so that we have more accurate
- * RSS stats. We don't do this in the normal page deactivation
- * when the system is loaded VM wise, because the cost of
- * the large number of page protect operations would be higher
- * than the value of doing the operation.
+ * We turn off page access, so that we have
+ * more accurate RSS stats. We don't do this
+ * in the normal page deactivation when the
+ * system is loaded VM wise, because the
+ * cost of the large number of page protect
+ * operations would be higher than the value
+ * of doing the operation.
*/
vm_page_protect(m, VM_PROT_NONE);
vm_page_deactivate(m);
@@ -1307,6 +1298,7 @@ vm_size_t count;
static void
vm_pageout()
{
+ int pass;
mtx_enter(&Giant, MTX_DEF);
@@ -1320,11 +1312,18 @@ vm_pageout()
vm_pageout_free_page_calc(cnt.v_page_count);
/*
- * free_reserved needs to include enough for the largest swap pager
- * structures plus enough for any pv_entry structs when paging.
+ * v_free_target and v_cache_min control pageout hysteresis. Note
+ * that these are more a measure of the VM cache queue hysteresis
+ * than the VM free queue. Specifically, v_free_target is the
+ * high water mark (free+cache pages).
+ *
+ * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
+ * low water mark, while v_free_min is the stop. v_cache_min must
+ * be big enough to handle memory needs while the pageout daemon
+ * is signalled and run to free more pages.
*/
if (cnt.v_free_count > 6144)
- cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved;
+ cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
else
cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
@@ -1362,10 +1361,9 @@ vm_pageout()
if (vm_pageout_stats_free_max == 0)
vm_pageout_stats_free_max = 5;
- max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16);
-
curproc->p_flag |= P_BUFEXHAUST;
swap_pager_swap_init();
+ pass = 0;
/*
* The pageout daemon is never done, so loop forever.
*/
@@ -1386,19 +1384,27 @@ vm_pageout()
}
if (vm_pages_needed) {
/*
- * Still not done, sleep a bit and go again
+ * Still not done, take a second pass without waiting
+ * (unlimited dirty cleaning), otherwise sleep a bit
+ * and try again.
*/
- tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+ ++pass;
+ if (pass > 1)
+ tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
- * Good enough, sleep & handle stats
+ * Good enough, sleep & handle stats. Prime the pass
+ * for the next run.
*/
+ if (pass > 1)
+ pass = 1;
+ else
+ pass = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
- if (vm_pageout_actcmp > 0)
- --vm_pageout_actcmp;
splx(s);
+ pass = 0;
vm_pageout_page_stats();
continue;
}
@@ -1407,7 +1413,7 @@ vm_pageout()
if (vm_pages_needed)
cnt.v_pdwakeups++;
splx(s);
- vm_pageout_scan();
+ vm_pageout_scan(pass);
vm_pageout_deficit = 0;
}
}
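
The PG_WINATCFLS part of vm_pageout_scan() can be read on its own: on a first (pass 0) encounter a dirty inactive page is only flagged and requeued at the tail, and only a page met again while still dirty, or any dirty page once pass > 0, is actually laundered, at most maxlaunder per scan. A stripped-down, single-threaded sketch of that policy over a toy page list follows; the struct, the queue, scan_inactive() and main() are invented for illustration, laundering is reduced to clearing the dirty bit, and only the PG_WINATCFLS value is taken from the patch.

#include <stdio.h>
#include <sys/queue.h>

#define PG_WINATCFLS    0x0004  /* dirty page has had its free ride */

struct page {
        TAILQ_ENTRY(page) q;
        int dirty;
        int flags;
};
TAILQ_HEAD(pagelist, page);

/* Returns the number of pages reclaimed from the inactive list. */
static int
scan_inactive(struct pagelist *pl, int npages, int pass, int maxlaunder)
{
        struct page *m, *next;
        int maxscan = npages;   /* look at each page at most once */
        int freed = 0;

        for (m = TAILQ_FIRST(pl); m != NULL && maxscan-- > 0; m = next) {
                next = TAILQ_NEXT(m, q);
                if (m->dirty == 0) {
                        /* clean pages are always cheap to reclaim */
                        TAILQ_REMOVE(pl, m, q);
                        freed++;
                } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                        /* first encounter: mark it and give it one more
                         * trip through the queue instead of flushing */
                        m->flags |= PG_WINATCFLS;
                        TAILQ_REMOVE(pl, m, q);
                        TAILQ_INSERT_TAIL(pl, m, q);
                } else if (maxlaunder > 0) {
                        /* second encounter (or a later pass): launder it;
                         * the now-clean page is reclaimed on the next scan */
                        m->dirty = 0;
                        maxlaunder--;
                }
        }
        return (freed);
}

int
main(void)
{
        struct pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
        struct page pg[4] = {
                { .dirty = 0 }, { .dirty = 1 }, { .dirty = 0 }, { .dirty = 1 }
        };
        int i;

        for (i = 0; i < 4; i++)
                TAILQ_INSERT_TAIL(&pl, &pg[i], q);
        printf("scan 1 freed %d\n", scan_inactive(&pl, 4, 0, 32));
        printf("scan 2 freed %d\n", scan_inactive(&pl, 2, 0, 32));
        printf("scan 3 freed %d\n", scan_inactive(&pl, 2, 0, 32));
        return (0);
}
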
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 3dd12ec..c79f62a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -300,10 +300,29 @@ vnode_pager_setsize(vp, nsize)
m = vm_page_lookup(object, OFF_TO_IDX(nsize));
if (m) {
+ int base = (int)nsize & PAGE_MASK;
+ int size = PAGE_SIZE - base;
+
+ /*
+ * Clear out partial-page garbage in case
+ * the page has been mapped.
+ */
kva = vm_pager_map_page(m);
- bzero((caddr_t) kva + (nsize & PAGE_MASK),
- (int) (round_page(nsize) - nsize));
+ bzero((caddr_t)kva + base, size);
vm_pager_unmap_page(kva);
+
+ /*
+ * Clear out partial-page dirty bits. This
+ * has the side effect of setting the valid
+ * bits, but that is ok. There are a bunch
+ * of places in the VM system where we expected
+ * m->dirty == VM_PAGE_BITS_ALL. The file EOF
+ * case is one of them. If the page is still
+ * partially dirty, make it fully dirty.
+ */
+ vm_page_set_validclean(m, base, size);
+ if (m->dirty != 0)
+ m->dirty = VM_PAGE_BITS_ALL;
}
}
}
@@ -424,6 +443,8 @@ vnode_pager_input_smlfs(object, m)
pbgetvp(dp, bp);
bp->b_bcount = bsize;
bp->b_bufsize = bsize;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
/* do the input */
BUF_STRATEGY(bp);
@@ -742,6 +763,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
pbgetvp(dp, bp);
bp->b_bcount = size;
bp->b_bufsize = size;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += count;
@@ -888,6 +911,11 @@ vnode_pager_putpages(object, m, count, sync, rtvals)
/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_PUTPAGES.
+ *
+ * This is typically called indirectly via the pageout daemon and
+ * clustering has already typically occurred, so in general we ask the
+ * underlying filesystem to write the data out asynchronously rather
+ * than delayed.
*/
int
vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
@@ -938,8 +966,13 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
}
}
+ /*
+ * pageouts are already clustered, use IO_ASYNC to force a bawrite()
+ * rather than a bdwrite() to prevent paging I/O from saturating
+ * the buffer cache.
+ */
ioflags = IO_VMIO;
- ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: 0;
+ ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC;
ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
aiov.iov_base = (caddr_t) 0;