Diffstat (limited to 'sys')
 sys/kern/vfs_bio.c          | 208
 sys/kern/vfs_cluster.c      |   6
 sys/kern/vfs_export.c       |  14
 sys/kern/vfs_subr.c         |  14
 sys/kern/vfs_vnops.c        |   4
 sys/sys/buf.h               |   1
 sys/sys/vnode.h             |   1
 sys/ufs/ffs/ffs_inode.c     |   3
 sys/ufs/ffs/ffs_softdep.c   |  36
 sys/ufs/ufs/ufs_readwrite.c |   4
 sys/vm/swap_pager.c         |   4
 sys/vm/vm_page.c            |  35
 sys/vm/vm_page.h            |   2
 sys/vm/vm_pageout.c         | 168
 14 files changed, 315 insertions(+), 185 deletions(-)
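
Most of the policy in this change hangs off cheap "are we short on memory?" predicates: the new buf_dirty_count_severe() added to vfs_bio.c and the vm_page_count_severe() checks it is paired with in the UFS write and VM paths below. The following is a minimal, self-contained userland model of that fallback pattern, not the kernel code; the counters and the severe threshold are invented stand-ins, and vm_page_count_severe() itself is not defined in this diff, so its body here is an assumption.

/*
 * Standalone model of the low-memory fallback pattern used below in
 * ffs_update() and the UFS WRITE path.  The counters and the "severe"
 * threshold are invented stand-ins; only buf_dirty_count_severe()
 * mirrors the helper added to vfs_bio.c.
 */
#include <stdio.h>

static int numdirtybuffers = 480;	/* stand-ins for kernel counters */
static int hidirtybuffers  = 256;
static int v_free_count    = 900;
static int v_cache_count   = 100;
static int v_free_severe   = 1200;	/* assumed severe-shortage mark */

/* Same test as the new helper in vfs_bio.c: too many dirty buffers. */
static int
buf_dirty_count_severe(void)
{
	return (numdirtybuffers >= hidirtybuffers);
}

/* Assumed shape of vm_page_count_severe(): free + cache pages too low. */
static int
vm_page_count_severe(void)
{
	return (v_free_count + v_cache_count < v_free_severe);
}

int
main(void)
{
	if (vm_page_count_severe() || buf_dirty_count_severe())
		printf("shortfall: write the buffer now (bwrite/bawrite)\n");
	else
		printf("normal: delay the write (bdwrite) for clustering\n");
	return (0);
}
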
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 17def1b..9a9aae7 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -597,8 +597,14 @@ bwrite(struct buf * bp) * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. + * + * This optimization eats a lot of memory. If we have a page + * or buffer shortfall we can't do it. */ - if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) { + if ((bp->b_xflags & BX_BKGRDWRITE) && + (bp->b_flags & B_ASYNC) && + !vm_page_count_severe() && + !buf_dirty_count_severe()) { if (bp->b_iodone != NULL) { printf("bp->b_iodone = %p\n", bp->b_iodone); panic("bwrite: need chained iodone"); @@ -682,7 +688,10 @@ vfs_backgroundwritedone(bp) /* * Clear the BX_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. + * If BX_BKGRDINPROG is not set in the original buffer it must + * have been released and re-instantiated - which is not legal. */ + KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_xflags &= ~BX_BKGRDINPROG; if (origbp->b_xflags & BX_BKGRDWAIT) { origbp->b_xflags &= ~BX_BKGRDWAIT; @@ -903,6 +912,15 @@ bwillwrite(void) } /* + * Return true if we have too many dirty buffers. + */ +int +buf_dirty_count_severe(void) +{ + return(numdirtybuffers >= hidirtybuffers); +} + +/* * brelse: * * Release a busy buffer and, if requested, free its resources. The @@ -964,10 +982,14 @@ brelse(struct buf * bp) * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. + * + * If B_DELWRI is not set we may have to set B_RELBUF if we are low + * on pages to return pages to the VM page queues. */ - if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; + else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG)) + bp->b_flags |= B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer @@ -989,8 +1011,7 @@ brelse(struct buf * bp) if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && !vn_isdisk(bp->b_vp, NULL) && - (bp->b_flags & B_DELWRI) && - (bp->b_xflags & BX_BKGRDINPROG)) + (bp->b_flags & B_DELWRI)) ) { int i, j, resid; @@ -1017,32 +1038,40 @@ brelse(struct buf * bp) * * See man buf(9) for more information */ - resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { + int had_bogus = 0; + m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); - if (m == bogus_page) { + /* + * If we hit a bogus page, fixup *all* the bogus pages + * now. 
+ */ + if (m == bogus_page) { VOP_GETVOBJECT(vp, &obj); poff = OFF_TO_IDX(bp->b_offset); + had_bogus = 1; for (j = i; j < bp->b_npages; j++) { - m = bp->b_pages[j]; - if (m == bogus_page) { - m = vm_page_lookup(obj, poff + j); - if (!m) { + vm_page_t mtmp; + mtmp = bp->b_pages[j]; + if (mtmp == bogus_page) { + mtmp = vm_page_lookup(obj, poff + j); + if (!mtmp) { panic("brelse: page missing\n"); } - bp->b_pages[j] = m; + bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } + m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; @@ -1051,9 +1080,11 @@ brelse(struct buf * bp) KASSERT(presid >= 0, ("brelse: extra page")); vm_page_set_invalid(m, poffset, presid); + if (had_bogus) + printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); - foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) @@ -1171,7 +1202,7 @@ brelse(struct buf * bp) /* * Release a buffer back to the appropriate queue but do not try to free - * it. + * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when @@ -1203,6 +1234,15 @@ bqrelse(struct buf * bp) } else if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + } else if (vm_page_count_severe()) { + /* + * We are too low on memory, we have to try to free the + * buffer (most importantly: the wired pages making up its + * backing store) *now*. + */ + splx(s); + brelse(bp); + return; } else { bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); @@ -1264,6 +1304,8 @@ vfs_vmio_release(bp) vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); + } else if (vm_page_count_severe()) { + vm_page_try_to_cache(m); } } } @@ -1419,15 +1461,15 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize) struct buf *nbp; int defrag = 0; int nqindex; - int isspecial; static int flushingbufs; - if (curproc != idleproc && - (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) - isspecial = 0; - else - isspecial = 1; - + /* + * We can't afford to block since we might be holding a vnode lock, + * which may prevent system daemons from running. We deal with + * low-memory situations by proactively returning memory and running + * async I/O rather then sync I/O. + */ + ++getnewbufcalls; --getnewbufrestarts; restart: @@ -1445,42 +1487,28 @@ restart: * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - if (isspecial == 0 && numfreebuffers < lofreebuffers) { + if (nbp == NULL) { /* - * This will cause an immediate failure + * If no EMPTYKVA buffers and we are either + * defragging or reusing, locate a CLEAN buffer + * to free or reuse. If bufspace useage is low + * skip this step so we can allocate a new buffer. */ - nqindex = QUEUE_CLEAN; - nbp = NULL; - } else { + if (defrag || bufspace >= lobufspace) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } + /* - * Locate a buffer which already has KVA assigned. First - * try EMPTYKVA buffers. + * Nada. If we are allowed to allocate an EMPTY + * buffer, go get one. 
*/ - nqindex = QUEUE_EMPTYKVA; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - - if (nbp == NULL) { - /* - * If no EMPTYKVA buffers and we are either - * defragging or reusing, locate a CLEAN buffer - * to free or reuse. If bufspace useage is low - * skip this step so we can allocate a new buffer. - */ - if (defrag || bufspace >= lobufspace) { - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); - } - - /* - * Nada. If we are allowed to allocate an EMPTY - * buffer, go get one. - */ - if (nbp == NULL && defrag == 0 && - (isspecial || bufspace < hibufspace)) { - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); - } + if (nbp == NULL && defrag == 0 && bufspace < hibufspace) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } @@ -1610,26 +1638,16 @@ restart: goto restart; } - /* - * If we are a normal process then deal with bufspace - * hysteresis. A normal process tries to keep bufspace - * between lobufspace and hibufspace. Note: if we encounter - * a buffer with b_kvasize == 0 then it means we started - * our scan on the EMPTY list and should allocate a new - * buffer. - */ - if (isspecial == 0) { - if (bufspace > hibufspace) - flushingbufs = 1; - if (flushingbufs && bp->b_kvasize != 0) { - bp->b_flags |= B_INVAL; - bfreekva(bp); - brelse(bp); - goto restart; - } - if (bufspace < lobufspace) - flushingbufs = 0; + if (bufspace >= hibufspace) + flushingbufs = 1; + if (flushingbufs && bp->b_kvasize != 0) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + goto restart; } + if (bufspace < lobufspace) + flushingbufs = 0; break; } @@ -1705,6 +1723,7 @@ restart: return(bp); } +#if 0 /* * waitfreebuffers: * @@ -1723,6 +1742,8 @@ waitfreebuffers(int slpflag, int slptimeo) } } +#endif + /* * buf_daemon: * @@ -2073,8 +2094,12 @@ loop: * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. + * + * XXX remove if 0 sections (clean this up after its proven) */ +#if 0 if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) { +#endif if (numfreebuffers == 0) { if (curproc == idleproc) return NULL; @@ -2082,9 +2107,11 @@ loop: tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); } +#if 0 } else if (numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } +#endif if ((bp = gbincore(vp, blkno))) { /* @@ -2468,7 +2495,13 @@ allocbuf(struct buf *bp, int size) pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { - m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); + /* + * note: must allocate system pages + * since blocking here could intefere + * with paging I/O, no matter which + * process we are. 
+ */ + m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM); if (m == NULL) { VM_WAIT; vm_pageout_deficit += desiredpages - bp->b_npages; @@ -2671,7 +2704,7 @@ bufdone(struct buf *bp) buf_complete(bp); if (bp->b_flags & B_VMIO) { - int i, resid; + int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; @@ -2722,16 +2755,29 @@ bufdone(struct buf *bp) for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; + int resid; + + resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; + if (resid > iosize) + resid = iosize; + + /* + * cleanup bogus pages, restoring the originals + */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { + panic("biodone: page disappeared!"); #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); bp->b_flags &= ~B_CACHE; + foff = (foff + PAGE_SIZE) & + ~(off_t)PAGE_MASK; + iosize -= resid; continue; } bp->b_pages[i] = m; @@ -2744,9 +2790,6 @@ bufdone(struct buf *bp) (unsigned long)foff, m->pindex); } #endif - resid = IDX_TO_OFF(m->pindex + 1) - foff; - if (resid > iosize) - resid = iosize; /* * In the write case, the valid and clean bits are @@ -2784,7 +2827,7 @@ bufdone(struct buf *bp) } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); - foff += resid; + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } if (obj) @@ -2862,7 +2905,7 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) * of the buffer. */ soff = off; - eoff = (off + PAGE_SIZE) & ~PAGE_MASK; + eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; @@ -2948,7 +2991,7 @@ retry: bp->b_pages[i] = bogus_page; bogus++; } - foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); @@ -2976,7 +3019,7 @@ vfs_clean_pages(struct buf * bp) ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; - vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; + vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) @@ -3104,9 +3147,14 @@ vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) tryagain: + /* + * note: must allocate system pages since blocking here + * could intefere with paging I/O, no matter which + * process we are. + */ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), - VM_ALLOC_NORMAL); + VM_ALLOC_SYSTEM); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 4f1aecf..29a1879 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -48,6 +48,7 @@ #include <sys/malloc.h> #include <sys/mount.h> #include <sys/resourcevar.h> +#include <sys/vmmeter.h> #include <vm/vm.h> #include <vm/vm_object.h> #include <vm/vm_page.h> @@ -665,6 +666,11 @@ cluster_write(bp, filesize, seqcount) cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; + } else if (vm_page_count_severe()) { + /* + * We are low on memory, get it going NOW + */ + bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. 
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index fd81bc8..cb46c34 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -1438,10 +1438,14 @@ vget(vp, flags, p) if ((flags & LK_INTERLOCK) == 0) mtx_enter(&vp->v_interlock, MTX_DEF); if (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - mtx_exit(&vp->v_interlock, MTX_DEF); - tsleep((caddr_t)vp, PINOD, "vget", 0); - return (ENOENT); + if (vp->v_vxproc == curproc) { + printf("VXLOCK interlock avoided\n"); + } else { + vp->v_flag |= VXWANT; + mtx_exit(&vp->v_interlock, MTX_DEF); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } } vp->v_usecount++; @@ -1731,6 +1735,7 @@ vclean(vp, flags, p) if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; + vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK @@ -1807,6 +1812,7 @@ vclean(vp, flags, p) vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; + vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index fd81bc8..cb46c34 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1438,10 +1438,14 @@ vget(vp, flags, p) if ((flags & LK_INTERLOCK) == 0) mtx_enter(&vp->v_interlock, MTX_DEF); if (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - mtx_exit(&vp->v_interlock, MTX_DEF); - tsleep((caddr_t)vp, PINOD, "vget", 0); - return (ENOENT); + if (vp->v_vxproc == curproc) { + printf("VXLOCK interlock avoided\n"); + } else { + vp->v_flag |= VXWANT; + mtx_exit(&vp->v_interlock, MTX_DEF); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } } vp->v_usecount++; @@ -1731,6 +1735,7 @@ vclean(vp, flags, p) if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; + vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. 
The VOP_LOCK @@ -1807,6 +1812,7 @@ vclean(vp, flags, p) vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; + vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 448a2a6..b7cea77 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -642,12 +642,14 @@ debug_vn_lock(vp, flags, p, filename, line) do { if ((flags & LK_INTERLOCK) == 0) mtx_enter(&vp->v_interlock, MTX_DEF); - if (vp->v_flag & VXLOCK) { + if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) { vp->v_flag |= VXWANT; mtx_exit(&vp->v_interlock, MTX_DEF); tsleep((caddr_t)vp, PINOD, "vn_lock", 0); error = ENOENT; } else { + if (vp->v_vxproc != NULL) + printf("VXLOCK interlock avoided in vn_lock\n"); #ifdef DEBUG_LOCKS vp->filename = filename; vp->line = line; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index d085de6..a10083f 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -494,6 +494,7 @@ struct uio; caddr_t bufhashinit __P((caddr_t)); void bufinit __P((void)); void bwillwrite __P((void)); +int buf_dirty_count_severe __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 582d00c..75462f6 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -129,6 +129,7 @@ struct vnode { short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ } v_pollinfo; + struct proc *v_vxproc; /* proc owning VXLOCK */ #ifdef DEBUG_LOCKS const char *filename; /* Source file doing locking */ int line; /* Line number doing locking */ diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 30f36ee7..a8ae464 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -45,6 +45,7 @@ #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/resourcevar.h> +#include <sys/vmmeter.h> #include <sys/stat.h> #include <vm/vm.h> @@ -111,6 +112,8 @@ ffs_update(vp, waitfor) ino_to_fsbo(fs, ip->i_number)) = ip->i_din; if (waitfor && !DOINGASYNC(vp)) { return (bwrite(bp)); + } else if (vm_page_count_severe() || buf_dirty_count_severe()) { + return (bwrite(bp)); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 98ad959..c6ac0bd 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -91,6 +91,8 @@ MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); +#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) + #define D_PAGEDEP 0 #define D_INODEDEP 1 #define D_NEWBLK 2 @@ -802,7 +804,7 @@ top: goto top; } MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, - M_WAITOK); + M_SOFTDEP_FLAGS); bzero(pagedep, sizeof(struct pagedep)); pagedep->pd_list.wk_type = D_PAGEDEP; pagedep->pd_mnt = mp; @@ -879,7 +881,7 @@ top: } num_inodedep += 1; MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), - M_INODEDEP, M_WAITOK); + M_INODEDEP, M_SOFTDEP_FLAGS); inodedep->id_list.wk_type = D_INODEDEP; inodedep->id_fs = fs; inodedep->id_ino = inum; @@ -941,7 +943,7 @@ top: if (sema_get(&newblk_in_progress, 0) == 0) goto top; MALLOC(newblk, struct newblk *, sizeof(struct newblk), - M_NEWBLK, M_WAITOK); + M_NEWBLK, M_SOFTDEP_FLAGS); newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; @@ -1127,7 +1129,7 @@ bmsafemap_lookup(bp) return 
(WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), - M_BMSAFEMAP, M_WAITOK); + M_BMSAFEMAP, M_SOFTDEP_FLAGS); bmsafemap->sm_list.wk_type = D_BMSAFEMAP; bmsafemap->sm_list.wk_state = 0; bmsafemap->sm_buf = bp; @@ -1187,7 +1189,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct newblk *newblk; MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), - M_ALLOCDIRECT, M_WAITOK); + M_ALLOCDIRECT, M_SOFTDEP_FLAGS); bzero(adp, sizeof(struct allocdirect)); adp->ad_list.wk_type = D_ALLOCDIRECT; adp->ad_lbn = lbn; @@ -1339,7 +1341,7 @@ newfreefrag(ip, blkno, size) if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), - M_FREEFRAG, M_WAITOK); + M_FREEFRAG, M_SOFTDEP_FLAGS); freefrag->ff_list.wk_type = D_FREEFRAG; freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ freefrag->ff_inum = ip->i_number; @@ -1408,7 +1410,7 @@ newallocindir(ip, ptrno, newblkno, oldblkno) struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), - M_ALLOCINDIR, M_WAITOK); + M_ALLOCINDIR, M_SOFTDEP_FLAGS); bzero(aip, sizeof(struct allocindir)); aip->ai_list.wk_type = D_ALLOCINDIR; aip->ai_state = ATTACHED; @@ -1561,7 +1563,7 @@ setup_allocindir_phase2(bp, ip, aip) if (indirdep) break; MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), - M_INDIRDEP, M_WAITOK); + M_INDIRDEP, M_SOFTDEP_FLAGS); newindirdep->ir_list.wk_type = D_INDIRDEP; newindirdep->ir_state = ATTACHED; LIST_INIT(&newindirdep->ir_deplisthd); @@ -1623,7 +1625,7 @@ softdep_setup_freeblocks(ip, length) if (length != 0) panic("softde_setup_freeblocks: non-zero length"); MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), - M_FREEBLKS, M_WAITOK); + M_FREEBLKS, M_SOFTDEP_FLAGS); bzero(freeblks, sizeof(struct freeblks)); freeblks->fb_list.wk_type = D_FREEBLKS; freeblks->fb_uid = ip->i_uid; @@ -1870,7 +1872,7 @@ softdep_freefile(pvp, ino, mode) * This sets up the inode de-allocation dependency. 
*/ MALLOC(freefile, struct freefile *, sizeof(struct freefile), - M_FREEFILE, M_WAITOK); + M_FREEFILE, M_SOFTDEP_FLAGS); freefile->fx_list.wk_type = D_FREEFILE; freefile->fx_list.wk_state = 0; freefile->fx_mode = mode; @@ -2186,7 +2188,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); - MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); + MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_offset = offset; @@ -2198,12 +2200,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, - M_WAITOK); + M_SOFTDEP_FLAGS); mkdir1->md_list.wk_type = D_MKDIR; mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, - M_WAITOK); + M_SOFTDEP_FLAGS); mkdir2->md_list.wk_type = D_MKDIR; mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; @@ -2438,7 +2440,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) (void) request_cleanup(FLUSH_REMOVE, 0); num_dirrem += 1; MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), - M_DIRREM, M_WAITOK); + M_DIRREM, M_SOFTDEP_FLAGS); bzero(dirrem, sizeof(struct dirrem)); dirrem->dm_list.wk_type = D_DIRREM; dirrem->dm_state = isrmdir ? RMDIR : 0; @@ -2535,7 +2537,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), - M_DIRADD, M_WAITOK); + M_DIRADD, M_SOFTDEP_FLAGS); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; @@ -2841,7 +2843,7 @@ softdep_disk_io_initiation(bp) * Replace up-to-date version with safe version. 
*/ MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, - M_INDIRDEP, M_WAITOK); + M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; @@ -2942,7 +2944,7 @@ initiate_write_inodeblock(inodedep, bp) if (inodedep->id_savedino != NULL) panic("initiate_write_inodeblock: already doing I/O"); MALLOC(inodedep->id_savedino, struct dinode *, - sizeof(struct dinode), M_INODEDEP, M_WAITOK); + sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS); *inodedep->id_savedino = *dp; bzero((caddr_t)dp, sizeof(struct dinode)); return; diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index be43550..785219c 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -48,6 +48,7 @@ #include <vm/vm_map.h> #include <vm/vnode_pager.h> #include <sys/event.h> +#include <sys/vmmeter.h> #define VN_KNOTE(vp, b) \ KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) @@ -501,6 +502,9 @@ WRITE(ap) } else { bawrite(bp); } + } else if (vm_page_count_severe() || buf_dirty_count_severe()) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 6a427c9..a625bc8 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -80,6 +80,7 @@ #include <sys/sysctl.h> #include <sys/blist.h> #include <sys/lock.h> +#include <sys/vmmeter.h> #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 @@ -1619,10 +1620,11 @@ swp_pager_async_iodone(bp) * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). */ - vm_page_protect(m, VM_PROT_READ); pmap_clear_modify(m); vm_page_undirty(m); vm_page_io_finish(m); + if (!vm_page_count_severe() || !vm_page_try_to_cache(m)) + vm_page_protect(m, VM_PROT_READ); } } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 038a5ad..9c868fc 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -860,7 +860,7 @@ loop: * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ - if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min) + if (vm_paging_needed()) pagedaemon_wakeup(); splx(s); @@ -882,10 +882,10 @@ vm_wait() s = splvm(); if (curproc == pageproc) { vm_pageout_pages_needed = 1; - tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0); + tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { - vm_pages_needed++; + vm_pages_needed = 1; wakeup(&vm_pages_needed); } tsleep(&cnt.v_free_count, PVM, "vmwait", 0); @@ -1030,7 +1030,8 @@ vm_page_free_wakeup() * if pageout daemon needs pages, then tell it that there are * some free. */ - if (vm_pageout_pages_needed) { + if (vm_pageout_pages_needed && + cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } @@ -1039,9 +1040,9 @@ vm_page_free_wakeup() * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ - if (vm_pages_needed && vm_page_count_min()) { - wakeup(&cnt.v_free_count); + if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; + wakeup(&cnt.v_free_count); } } @@ -1240,6 +1241,9 @@ vm_page_wire(m) * processes. This optimization causes one-time-use metadata to be * reused more quickly. * + * BUT, if we are in a low-memory situation we have no choice but to + * put clean pages on the cache queue. 
+ * * A number of routines use vm_page_unwire() to guarantee that the page * will go into either the inactive or active queues, and will NEVER * be placed in the cache - for example, just after dirtying a page. @@ -1326,6 +1330,25 @@ vm_page_deactivate(vm_page_t m) } /* + * vm_page_try_to_cache: + * + * Returns 0 on failure, 1 on success + */ +int +vm_page_try_to_cache(vm_page_t m) +{ + if (m->dirty || m->hold_count || m->busy || m->wire_count || + (m->flags & (PG_BUSY|PG_UNMANAGED))) { + return(0); + } + vm_page_test_dirty(m); + if (m->dirty) + return(0); + vm_page_cache(m); + return(1); +} + +/* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index cf58985..4c31df9 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -251,6 +251,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; #define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ #define PG_NOSYNC 0x0400 /* do not collect for syncer */ #define PG_UNMANAGED 0x0800 /* No PV management for page */ +#define PG_MARKER 0x1000 /* special queue marker page */ /* * Misc constants. @@ -403,6 +404,7 @@ void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); +int vm_page_try_to_cache __P((vm_page_t)); void vm_page_dontneed __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); static __inline void vm_page_free __P((vm_page_t)); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index d12ecac..4ab3930 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -146,6 +146,7 @@ static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; static int max_page_launder=100; +static int vm_pageout_actcmp=0; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; @@ -189,6 +190,8 @@ SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, SYSCTL_INT(_vm, OID_AUTO, max_page_launder, CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass"); +SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp, + CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness"); #define VM_PAGEOUT_PAGE_COUNT 16 @@ -372,6 +375,7 @@ vm_pageout_flush(mc, count, flags) */ for (i = 0; i < count; i++) { + KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL && mc[i]->dirty == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially dirty page", mc[i], i, count)); vm_page_io_start(mc[i]); vm_page_protect(mc[i], VM_PROT_READ); } @@ -424,6 +428,8 @@ vm_pageout_flush(mc, count, flags) if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); + if (!vm_page_count_severe() || !vm_page_try_to_cache(mt)) + vm_page_protect(mt, VM_PROT_READ); } } return numpagedout; @@ -621,10 +627,10 @@ static int vm_pageout_scan() { vm_page_t m, next; + struct vm_page marker; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; int maxlaunder; - int launder_loop = 0; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; @@ -646,33 +652,37 @@ vm_pageout_scan() /* * Calculate the number of pages we want to either free or move - * to the cache. + * to the cache. Be more agressive if we aren't making our target. 
*/ - page_shortage = vm_paging_target() + addl_page_shortage_init; + page_shortage = vm_paging_target() + + addl_page_shortage_init + vm_pageout_actcmp; /* - * Figure out what to do with dirty pages when they are encountered. - * Assume that 1/3 of the pages on the inactive list are clean. If - * we think we can reach our target, disable laundering (do not - * clean any dirty pages). If we miss the target we will loop back - * up and do a laundering run. + * Figure out how agressively we should flush dirty pages. */ + { + int factor = vm_pageout_actcmp; - if (cnt.v_inactive_count / 3 > page_shortage) { - maxlaunder = 0; - launder_loop = 0; - } else { - maxlaunder = - (cnt.v_inactive_target > max_page_launder) ? - max_page_launder : cnt.v_inactive_target; - launder_loop = 1; + maxlaunder = cnt.v_inactive_target / 3 + factor; + if (maxlaunder > max_page_launder + factor) + maxlaunder = max_page_launder + factor; } /* + * Initialize our marker + */ + bzero(&marker, sizeof(marker)); + marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; + marker.queue = PQ_INACTIVE; + marker.wire_count = 1; + + /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or - * we have scanned the entire inactive queue. + * we have scanned the entire inactive queue. Note that m->act_count + * is not used to form decisions for the inactive queue, only for the + * active queue. */ rescan0: @@ -690,6 +700,12 @@ rescan0: next = TAILQ_NEXT(m, pageq); + /* + * skip marker pages + */ + if (m->flags & PG_MARKER) + continue; + if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); @@ -766,7 +782,8 @@ rescan0: --page_shortage; /* - * Clean pages can be placed onto the cache queue. + * Clean pages can be placed onto the cache queue. This + * effectively frees them. */ } else if (m->dirty == 0) { vm_page_cache(m); @@ -777,7 +794,6 @@ rescan0: * only a limited number of pages per pagedaemon pass. */ } else if (maxlaunder > 0) { - int written; int swap_pageouts_ok; struct vnode *vp = NULL; struct mount *mp; @@ -806,29 +822,6 @@ rescan0: } /* - * For now we protect against potential memory - * deadlocks by requiring significant memory to be - * free if the object is not OBJT_DEFAULT or OBJT_SWAP. - * We do not 'trust' any other object type to operate - * with low memory, not even OBJT_DEVICE. The VM - * allocator will special case allocations done by - * the pageout daemon so the check below actually - * does have some hysteresis in it. It isn't the best - * solution, though. - */ - - if (object->type != OBJT_DEFAULT && - object->type != OBJT_SWAP && - cnt.v_free_count < cnt.v_free_reserved) { - s = splvm(); - TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); - TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, - pageq); - splx(s); - continue; - } - - /* * Presumably we have sufficient free memory to do * the more sophisticated checks and locking required * for vnodes. @@ -879,10 +872,15 @@ rescan0: } /* - * The page might have been moved to another queue - * during potential blocking in vget() above. + * The page might have been moved to another + * queue during potential blocking in vget() + * above. The page might have been freed and + * reused for another vnode. The object might + * have been reused for another vnode. 
*/ - if (m->queue != PQ_INACTIVE) { + if (m->queue != PQ_INACTIVE || + m->object != object || + object->handle != vp) { if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); @@ -891,9 +889,10 @@ rescan0: } /* - * The page may have been busied during the blocking in - * vput(); We don't move the page back onto the end of - * the queue so that statistics are more correct if we don't. + * The page may have been busied during the + * blocking in vput(); We don't move the + * page back onto the end of the queue so that + * statistics are more correct if we don't. */ if (m->busy || (m->flags & PG_BUSY)) { vput(vp); @@ -921,42 +920,57 @@ rescan0: * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we - * start the cleaning operation. + * start the cleaning operation. maxlaunder nominally + * counts I/O cost (seeks) rather then bytes. + * + * This operation may cluster, invalidating the 'next' + * pointer. To prevent an inordinate number of + * restarts we use our marker to remember our place. */ - written = vm_pageout_clean(m); + s = splvm(); + TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq); + splx(s); + if (vm_pageout_clean(m) != 0) + --maxlaunder; + s = splvm(); + next = TAILQ_NEXT(&marker, pageq); + TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq); + splx(s); if (vp) { vput(vp); vn_finished_write(mp); } - - maxlaunder -= written; } } /* - * If we still have a page shortage and we didn't launder anything, - * run the inactive scan again and launder something this time. + * If we were not able to meet our target, increase actcmp */ - if (launder_loop == 0 && page_shortage > 0) { - launder_loop = 1; - maxlaunder = - (cnt.v_inactive_target > max_page_launder) ? - max_page_launder : cnt.v_inactive_target; - goto rescan0; + if (vm_page_count_min()) { + if (vm_pageout_actcmp < ACT_MAX / 2) + vm_pageout_actcmp += ACT_ADVANCE; + } else { + if (vm_pageout_actcmp < ACT_DECLINE) + vm_pageout_actcmp = 0; + else + vm_pageout_actcmp -= ACT_DECLINE; } /* - * Compute the page shortage from the point of view of having to - * move pages from the active queue to the inactive queue. + * Compute the number of pages we want to try to move from the + * active queue to the inactive queue. */ - page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); + page_shortage = vm_paging_target() + + cnt.v_inactive_target - cnt.v_inactive_count; page_shortage += addl_page_shortage; + page_shortage += vm_pageout_actcmp; /* - * Scan the active queue for things we can deactivate + * Scan the active queue for things we can deactivate. We nominally + * track the per-page activity counter and use it to locate + * deactivation candidates. */ pcount = cnt.v_active_count; @@ -1026,7 +1040,8 @@ rescan0: } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm_lru || - (m->object->ref_count == 0) || (m->act_count == 0)) { + (m->object->ref_count == 0) || + (m->act_count <= vm_pageout_actcmp)) { page_shortage--; if (m->object->ref_count == 0) { vm_page_protect(m, VM_PROT_NONE); @@ -1111,7 +1126,7 @@ rescan0: * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. 
*/ - if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) { + if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) { bigproc = NULL; bigsize = 0; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { @@ -1349,20 +1364,31 @@ vm_pageout() int error; int s = splvm(); - if (vm_pages_needed && vm_page_count_min()) { + /* + * If we have enough free memory, wakeup waiters. Do + * not clear vm_pages_needed until we reach our target, + * otherwise we may be woken up over and over again and + * waste a lot of cpu. + */ + if (vm_pages_needed && !vm_page_count_min()) { + if (vm_paging_needed() <= 0) + vm_pages_needed = 0; + wakeup(&cnt.v_free_count); + } + if (vm_pages_needed) { /* * Still not done, sleep a bit and go again */ - vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", hz/2); } else { /* * Good enough, sleep & handle stats */ - vm_pages_needed = 0; error = tsleep(&vm_pages_needed, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { + if (vm_pageout_actcmp > 0) + --vm_pageout_actcmp; splx(s); vm_pageout_page_stats(); continue; @@ -1371,11 +1397,9 @@ vm_pageout() if (vm_pages_needed) cnt.v_pdwakeups++; - vm_pages_needed = 0; splx(s); vm_pageout_scan(); vm_pageout_deficit = 0; - wakeup(&cnt.v_free_count); } } |
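
One notable technique in the vm_pageout.c hunks above: because vm_pageout_clean() can cluster and requeue pages, the plain 'next' pointer can be invalidated mid-scan, so the scan now parks a dummy PG_MARKER page after the current entry and resumes from it afterwards. Below is a self-contained sketch of that pattern using the same TAILQ idiom; the types, the requeue-at-tail clean routine, and the maxscan bound are invented stand-ins, and only the insert/skip/resume pattern follows the diff.

/*
 * Userland sketch of the marker technique vm_pageout_scan() adopts in
 * the last hunks: a dummy entry flagged PG_MARKER is parked after the
 * page being laundered so the scan can find its place again after
 * vm_pageout_clean() reshuffles the queue.
 */
#include <stdio.h>
#include <sys/queue.h>

#define PG_MARKER	0x1000

struct page {
	int flags;
	int id;
	TAILQ_ENTRY(page) pageq;
};
TAILQ_HEAD(pagelist, page);

/* Stand-in for vm_pageout_clean(): may reorder the queue under us. */
static void
clean_page(struct pagelist *q, struct page *m)
{
	printf("laundering page %d\n", m->id);
	TAILQ_REMOVE(q, m, pageq);
	TAILQ_INSERT_TAIL(q, m, pageq);
}

int
main(void)
{
	struct pagelist q;
	struct page pages[4], marker, *m, *next;
	int i, maxscan;

	TAILQ_INIT(&q);
	for (i = 0; i < 4; i++) {
		pages[i].flags = 0;
		pages[i].id = i;
		TAILQ_INSERT_TAIL(&q, &pages[i], pageq);
	}
	marker.flags = PG_MARKER;
	marker.id = -1;

	maxscan = 4;		/* bound the pass, as the kernel does */
	for (m = TAILQ_FIRST(&q); m != NULL && maxscan-- > 0; m = next) {
		next = TAILQ_NEXT(m, pageq);
		if (m->flags & PG_MARKER)	/* skip marker entries */
			continue;
		/* Remember our place, do work that may move 'm'... */
		TAILQ_INSERT_AFTER(&q, m, &marker, pageq);
		clean_page(&q, m);
		/* ...then resume from the marker and pull it back out. */
		next = TAILQ_NEXT(&marker, pageq);
		TAILQ_REMOVE(&q, &marker, pageq);
	}
	return (0);
}
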