author:    dillon <dillon@FreeBSD.org>  2000-11-18 23:06:26 +0000
committer: dillon <dillon@FreeBSD.org>  2000-11-18 23:06:26 +0000
commit:    2ace35208525bb250b47fe7af60ec2ce681c6c92 (patch)
tree:      8b9f3edb21d176840f55c8efbf3c9ffe76fdabc6 /sys
parent:    59e131028ff3997be98ab838d5ab9f965b1589ca (diff)
Implement a low-memory deadlock solution.
Removed most of the hacks that previously tried to deal with low-memory situations. The new code is based on the concept that I/O must be able to function in a low-memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low-memory situation, but rather than block they continue to operate, then return resources to the memory pool instead of caching them or leaving them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked.

Thus, situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist.

Implement a number of VFS/BIO fixes (found by Ian):

In biodone(), in the bogus-page replacement code, the loop was not properly incrementing its loop variables prior to a continue statement. We do not believe this code can be hit anyway, but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled.

In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or a biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS.

Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations used signed arithmetic. It is better to properly extend PAGE_MASK first before inverting it for the 64-bit masking op.

In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption.

There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this, but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather than continue to corrupt the filesystem. The problem does not occur very often; it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported.

Review by:  (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
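To illustrate the ~PAGE_MASK cast issue described above, here is a minimal userland sketch (not kernel code; a 4K page and a 64-bit offset type are assumed, and the names are illustrative only):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)          /* a plain int, as in the headers */

int main(void)
{
	int64_t foff = INT64_C(0x123456789ABC);  /* a file offset above 4GB */

	/*
	 * ~PAGE_MASK is the int -4096; it sign-extends to
	 * 0xfffffffffffff000 when combined with the 64-bit offset, so this
	 * happens to work -- but only by accident of signed arithmetic.
	 */
	int64_t relies_on_sign = (foff + PAGE_SIZE) & ~PAGE_MASK;

	/*
	 * Widening the mask before inverting makes the intent explicit and
	 * keeps working even if the mask ever becomes unsigned.
	 */
	int64_t widened_first = (foff + PAGE_SIZE) & ~(int64_t)PAGE_MASK;

	/* An unsigned 32-bit mask zero-extends and silently drops the high bits. */
	int64_t truncated = (foff + PAGE_SIZE) & ~(uint32_t)PAGE_MASK;

	printf("sign-extended: 0x%" PRIx64 "\n", (uint64_t)relies_on_sign);
	printf("widened first: 0x%" PRIx64 "\n", (uint64_t)widened_first);
	printf("truncated:     0x%" PRIx64 "\n", (uint64_t)truncated);
	return 0;
}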
Diffstat (limited to 'sys')
-rw-r--r--  sys/kern/vfs_bio.c           208
-rw-r--r--  sys/kern/vfs_cluster.c         6
-rw-r--r--  sys/kern/vfs_export.c         14
-rw-r--r--  sys/kern/vfs_subr.c           14
-rw-r--r--  sys/kern/vfs_vnops.c           4
-rw-r--r--  sys/sys/buf.h                  1
-rw-r--r--  sys/sys/vnode.h                1
-rw-r--r--  sys/ufs/ffs/ffs_inode.c        3
-rw-r--r--  sys/ufs/ffs/ffs_softdep.c     36
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c    4
-rw-r--r--  sys/vm/swap_pager.c            4
-rw-r--r--  sys/vm/vm_page.c              35
-rw-r--r--  sys/vm/vm_page.h               2
-rw-r--r--  sys/vm/vm_pageout.c          168
14 files changed, 315 insertions, 185 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 17def1b..9a9aae7 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -597,8 +597,14 @@ bwrite(struct buf * bp)
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
* copy so as to leave this buffer ready for further use.
+ *
+ * This optimization eats a lot of memory. If we have a page
+ * or buffer shortfall we can't do it.
*/
- if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) {
+ if ((bp->b_xflags & BX_BKGRDWRITE) &&
+ (bp->b_flags & B_ASYNC) &&
+ !vm_page_count_severe() &&
+ !buf_dirty_count_severe()) {
if (bp->b_iodone != NULL) {
printf("bp->b_iodone = %p\n", bp->b_iodone);
panic("bwrite: need chained iodone");
@@ -682,7 +688,10 @@ vfs_backgroundwritedone(bp)
/*
* Clear the BX_BKGRDINPROG flag in the original buffer
* and awaken it if it is waiting for the write to complete.
+ * If BX_BKGRDINPROG is not set in the original buffer it must
+ * have been released and re-instantiated - which is not legal.
*/
+ KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
origbp->b_xflags &= ~BX_BKGRDINPROG;
if (origbp->b_xflags & BX_BKGRDWAIT) {
origbp->b_xflags &= ~BX_BKGRDWAIT;
@@ -903,6 +912,15 @@ bwillwrite(void)
}
/*
+ * Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+ return(numdirtybuffers >= hidirtybuffers);
+}
+
+/*
* brelse:
*
* Release a busy buffer and, if requested, free its resources. The
@@ -964,10 +982,14 @@ brelse(struct buf * bp)
*
* We still allow the B_INVAL case to call vfs_vmio_release(), even
* if B_DELWRI is set.
+ *
+ * If B_DELWRI is not set we may have to set B_RELBUF if we are low
+ * on pages to return pages to the VM page queues.
*/
-
if (bp->b_flags & B_DELWRI)
bp->b_flags &= ~B_RELBUF;
+ else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
+ bp->b_flags |= B_RELBUF;
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
@@ -989,8 +1011,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
!vn_isdisk(bp->b_vp, NULL) &&
- (bp->b_flags & B_DELWRI) &&
- (bp->b_xflags & BX_BKGRDINPROG))
+ (bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@@ -1017,32 +1038,40 @@ brelse(struct buf * bp)
*
* See man buf(9) for more information
*/
-
resid = bp->b_bufsize;
foff = bp->b_offset;
for (i = 0; i < bp->b_npages; i++) {
+ int had_bogus = 0;
+
m = bp->b_pages[i];
vm_page_flag_clear(m, PG_ZERO);
- if (m == bogus_page) {
+ /*
+ * If we hit a bogus page, fixup *all* the bogus pages
+ * now.
+ */
+ if (m == bogus_page) {
VOP_GETVOBJECT(vp, &obj);
poff = OFF_TO_IDX(bp->b_offset);
+ had_bogus = 1;
for (j = i; j < bp->b_npages; j++) {
- m = bp->b_pages[j];
- if (m == bogus_page) {
- m = vm_page_lookup(obj, poff + j);
- if (!m) {
+ vm_page_t mtmp;
+ mtmp = bp->b_pages[j];
+ if (mtmp == bogus_page) {
+ mtmp = vm_page_lookup(obj, poff + j);
+ if (!mtmp) {
panic("brelse: page missing\n");
}
- bp->b_pages[j] = m;
+ bp->b_pages[j] = mtmp;
}
}
if ((bp->b_flags & B_INVAL) == 0) {
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
+ m = bp->b_pages[i];
}
if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
int poffset = foff & PAGE_MASK;
@@ -1051,9 +1080,11 @@ brelse(struct buf * bp)
KASSERT(presid >= 0, ("brelse: extra page"));
vm_page_set_invalid(m, poffset, presid);
+ if (had_bogus)
+ printf("avoided corruption bug in bogus_page/brelse code\n");
}
resid -= PAGE_SIZE - (foff & PAGE_MASK);
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bp->b_flags & (B_INVAL | B_RELBUF))
@@ -1171,7 +1202,7 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
- * it.
+ * it. The buffer is expected to be used again soon.
*
* bqrelse() is used by bdwrite() to requeue a delayed write, and used by
* biodone() to requeue an async I/O on completion. It is also used when
@@ -1203,6 +1234,15 @@ bqrelse(struct buf * bp)
} else if (bp->b_flags & B_DELWRI) {
bp->b_qindex = QUEUE_DIRTY;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are too low on memory, we have to try to free the
+ * buffer (most importantly: the wired pages making up its
+ * backing store) *now*.
+ */
+ splx(s);
+ brelse(bp);
+ return;
} else {
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
@@ -1264,6 +1304,8 @@ vfs_vmio_release(bp)
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
+ } else if (vm_page_count_severe()) {
+ vm_page_try_to_cache(m);
}
}
}
@@ -1419,15 +1461,15 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
struct buf *nbp;
int defrag = 0;
int nqindex;
- int isspecial;
static int flushingbufs;
- if (curproc != idleproc &&
- (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
- isspecial = 0;
- else
- isspecial = 1;
-
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+ * async I/O rather then sync I/O.
+ */
+
++getnewbufcalls;
--getnewbufrestarts;
restart:
@@ -1445,42 +1487,28 @@ restart:
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
- if (isspecial == 0 && numfreebuffers < lofreebuffers) {
+ if (nbp == NULL) {
/*
- * This will cause an immediate failure
+ * If no EMPTYKVA buffers and we are either
+ * defragging or reusing, locate a CLEAN buffer
+ * to free or reuse. If bufspace useage is low
+ * skip this step so we can allocate a new buffer.
*/
- nqindex = QUEUE_CLEAN;
- nbp = NULL;
- } else {
+ if (defrag || bufspace >= lobufspace) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
/*
- * Locate a buffer which already has KVA assigned. First
- * try EMPTYKVA buffers.
+ * Nada. If we are allowed to allocate an EMPTY
+ * buffer, go get one.
*/
- nqindex = QUEUE_EMPTYKVA;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
- if (nbp == NULL) {
- /*
- * If no EMPTYKVA buffers and we are either
- * defragging or reusing, locate a CLEAN buffer
- * to free or reuse. If bufspace useage is low
- * skip this step so we can allocate a new buffer.
- */
- if (defrag || bufspace >= lobufspace) {
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
- }
-
- /*
- * Nada. If we are allowed to allocate an EMPTY
- * buffer, go get one.
- */
- if (nbp == NULL && defrag == 0 &&
- (isspecial || bufspace < hibufspace)) {
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
- }
+ if (nbp == NULL && defrag == 0 && bufspace < hibufspace) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
}
}
@@ -1610,26 +1638,16 @@ restart:
goto restart;
}
- /*
- * If we are a normal process then deal with bufspace
- * hysteresis. A normal process tries to keep bufspace
- * between lobufspace and hibufspace. Note: if we encounter
- * a buffer with b_kvasize == 0 then it means we started
- * our scan on the EMPTY list and should allocate a new
- * buffer.
- */
- if (isspecial == 0) {
- if (bufspace > hibufspace)
- flushingbufs = 1;
- if (flushingbufs && bp->b_kvasize != 0) {
- bp->b_flags |= B_INVAL;
- bfreekva(bp);
- brelse(bp);
- goto restart;
- }
- if (bufspace < lobufspace)
- flushingbufs = 0;
+ if (bufspace >= hibufspace)
+ flushingbufs = 1;
+ if (flushingbufs && bp->b_kvasize != 0) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
}
+ if (bufspace < lobufspace)
+ flushingbufs = 0;
break;
}
@@ -1705,6 +1723,7 @@ restart:
return(bp);
}
+#if 0
/*
* waitfreebuffers:
*
@@ -1723,6 +1742,8 @@ waitfreebuffers(int slpflag, int slptimeo)
}
}
+#endif
+
/*
* buf_daemon:
*
@@ -2073,8 +2094,12 @@ loop:
* If this check ever becomes a bottleneck it may be better to
* move it into the else, when gbincore() fails. At the moment
* it isn't a problem.
+ *
+ * XXX remove if 0 sections (clean this up after its proven)
*/
+#if 0
if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
+#endif
if (numfreebuffers == 0) {
if (curproc == idleproc)
return NULL;
@@ -2082,9 +2107,11 @@ loop:
tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
slptimeo);
}
+#if 0
} else if (numfreebuffers < lofreebuffers) {
waitfreebuffers(slpflag, slptimeo);
}
+#endif
if ((bp = gbincore(vp, blkno))) {
/*
@@ -2468,7 +2495,13 @@ allocbuf(struct buf *bp, int size)
pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
if ((m = vm_page_lookup(obj, pi)) == NULL) {
- m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
+ /*
+ * note: must allocate system pages
+ * since blocking here could intefere
+ * with paging I/O, no matter which
+ * process we are.
+ */
+ m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
if (m == NULL) {
VM_WAIT;
vm_pageout_deficit += desiredpages - bp->b_npages;
@@ -2671,7 +2704,7 @@ bufdone(struct buf *bp)
buf_complete(bp);
if (bp->b_flags & B_VMIO) {
- int i, resid;
+ int i;
vm_ooffset_t foff;
vm_page_t m;
vm_object_t obj;
@@ -2722,16 +2755,29 @@ bufdone(struct buf *bp)
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
+ int resid;
+
+ resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * cleanup bogus pages, restoring the originals
+ */
m = bp->b_pages[i];
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
if (!m) {
+ panic("biodone: page disappeared!");
#if defined(VFS_BIO_DEBUG)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
bp->b_flags &= ~B_CACHE;
+ foff = (foff + PAGE_SIZE) &
+ ~(off_t)PAGE_MASK;
+ iosize -= resid;
continue;
}
bp->b_pages[i] = m;
@@ -2744,9 +2790,6 @@ bufdone(struct buf *bp)
(unsigned long)foff, m->pindex);
}
#endif
- resid = IDX_TO_OFF(m->pindex + 1) - foff;
- if (resid > iosize)
- resid = iosize;
/*
* In the write case, the valid and clean bits are
@@ -2784,7 +2827,7 @@ bufdone(struct buf *bp)
}
vm_page_io_finish(m);
vm_object_pip_subtract(obj, 1);
- foff += resid;
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
if (obj)
@@ -2862,7 +2905,7 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* of the buffer.
*/
soff = off;
- eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
+ eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
if (eoff > bp->b_offset + bp->b_bcount)
eoff = bp->b_offset + bp->b_bcount;
@@ -2948,7 +2991,7 @@ retry:
bp->b_pages[i] = bogus_page;
bogus++;
}
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bogus)
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
@@ -2976,7 +3019,7 @@ vfs_clean_pages(struct buf * bp)
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
- vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
vm_ooffset_t eoff = noff;
if (eoff > bp->b_offset + bp->b_bufsize)
@@ -3104,9 +3147,14 @@ vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
tryagain:
+ /*
+ * note: must allocate system pages since blocking here
+ * could intefere with paging I/O, no matter which
+ * process we are.
+ */
p = vm_page_alloc(kernel_object,
((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
- VM_ALLOC_NORMAL);
+ VM_ALLOC_SYSTEM);
if (!p) {
vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
VM_WAIT;
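The bufdone() hunks above advance foff and iosize on every page, including pages that are skipped with continue, and clamp the per-page resid against what is left of the I/O. A standalone sketch of that iteration pattern follows; the is_bogus() helper and the 4K page size are assumptions for illustration, not the kernel's code:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)
#define NPAGES    4

/* hypothetical stand-in for the bogus_page sentinel check */
static int is_bogus(int i) { return i == 1; }

static void walk(int64_t foff, int fixed)
{
	int64_t iosize = NPAGES * PAGE_SIZE;

	printf("%s:\n", fixed ? "fixed" : "buggy");
	for (int i = 0; i < NPAGES; i++) {
		/* portion of the I/O landing in this page, clamped to what's left */
		int64_t resid = ((foff + PAGE_SIZE) & ~(int64_t)PAGE_MASK) - foff;
		if (resid > iosize)
			resid = iosize;

		if (is_bogus(i)) {
			if (fixed) {
				/* the fix: advance the per-page state before 'continue' */
				foff = (foff + PAGE_SIZE) & ~(int64_t)PAGE_MASK;
				iosize -= resid;
			}
			continue;	/* the buggy version skipped the update as well */
		}

		printf("  page %d completed at file offset %lld\n", i, (long long)foff);
		foff = (foff + PAGE_SIZE) & ~(int64_t)PAGE_MASK;
		iosize -= resid;
	}
}

int main(void)
{
	int64_t start = 3 * PAGE_SIZE + 100;	/* an unaligned buffer offset */
	walk(start, 0);		/* pages after the skipped one see stale offsets */
	walk(start, 1);
	return 0;
}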
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 4f1aecf..29a1879 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -48,6 +48,7 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -665,6 +666,11 @@ cluster_write(bp, filesize, seqcount)
cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are low on memory, get it going NOW
+ */
+ bawrite(bp);
} else {
/*
* In the middle of a cluster, so just delay the I/O for now.
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index fd81bc8..cb46c34 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -1438,10 +1438,14 @@ vget(vp, flags, p)
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
- vp->v_flag |= VXWANT;
- mtx_exit(&vp->v_interlock, MTX_DEF);
- tsleep((caddr_t)vp, PINOD, "vget", 0);
- return (ENOENT);
+ if (vp->v_vxproc == curproc) {
+ printf("VXLOCK interlock avoided\n");
+ } else {
+ vp->v_flag |= VXWANT;
+ mtx_exit(&vp->v_interlock, MTX_DEF);
+ tsleep((caddr_t)vp, PINOD, "vget", 0);
+ return (ENOENT);
+ }
}
vp->v_usecount++;
@@ -1731,6 +1735,7 @@ vclean(vp, flags, p)
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
vp->v_flag |= VXLOCK;
+ vp->v_vxproc = curproc;
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
@@ -1807,6 +1812,7 @@ vclean(vp, flags, p)
vn_pollgone(vp);
vp->v_tag = VT_NON;
vp->v_flag &= ~VXLOCK;
+ vp->v_vxproc = NULL;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
wakeup((caddr_t) vp);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index fd81bc8..cb46c34 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1438,10 +1438,14 @@ vget(vp, flags, p)
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
- vp->v_flag |= VXWANT;
- mtx_exit(&vp->v_interlock, MTX_DEF);
- tsleep((caddr_t)vp, PINOD, "vget", 0);
- return (ENOENT);
+ if (vp->v_vxproc == curproc) {
+ printf("VXLOCK interlock avoided\n");
+ } else {
+ vp->v_flag |= VXWANT;
+ mtx_exit(&vp->v_interlock, MTX_DEF);
+ tsleep((caddr_t)vp, PINOD, "vget", 0);
+ return (ENOENT);
+ }
}
vp->v_usecount++;
@@ -1731,6 +1735,7 @@ vclean(vp, flags, p)
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
vp->v_flag |= VXLOCK;
+ vp->v_vxproc = curproc;
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
@@ -1807,6 +1812,7 @@ vclean(vp, flags, p)
vn_pollgone(vp);
vp->v_tag = VT_NON;
vp->v_flag &= ~VXLOCK;
+ vp->v_vxproc = NULL;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
wakeup((caddr_t) vp);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 448a2a6..b7cea77 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -642,12 +642,14 @@ debug_vn_lock(vp, flags, p, filename, line)
do {
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
- if (vp->v_flag & VXLOCK) {
+ if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
error = ENOENT;
} else {
+ if (vp->v_vxproc != NULL)
+ printf("VXLOCK interlock avoided in vn_lock\n");
#ifdef DEBUG_LOCKS
vp->filename = filename;
vp->line = line;
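The v_vxproc changes above let the process that set VXLOCK re-enter vget()/vn_lock() instead of sleeping on itself. A toy sketch of the owner check follows; toy_vnode and the pointer-as-process-identity are illustrative assumptions, not the kernel's types:

#include <stdio.h>

/* toy vnode carrying only what the VXLOCK dance needs */
struct toy_vnode {
	int   vxlock;		/* stands in for the VXLOCK flag bit */
	void *vxproc;		/* owner of the VXLOCK, like v_vxproc */
};

/* returns 0 if the caller may proceed, -1 if it would have to sleep */
static int toy_vget(struct toy_vnode *vp, void *curproc)
{
	if (vp->vxlock) {
		if (vp->vxproc == curproc) {
			/* we are the process doing vclean(); re-entry is safe */
			printf("VXLOCK interlock avoided\n");
		} else {
			/* someone else is cleaning the vnode; we would sleep here */
			return (-1);
		}
	}
	/* ...bump the use count, lock the vnode, etc... */
	return (0);
}

int main(void)
{
	struct toy_vnode vn = { 0, NULL };
	int me, other;

	vn.vxlock = 1;		/* vclean() in progress... */
	vn.vxproc = &me;	/* ...started by us */
	printf("self:  %d\n", toy_vget(&vn, &me));	/* proceeds */
	printf("other: %d\n", toy_vget(&vn, &other));	/* would block: -1 */
	return 0;
}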
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index d085de6..a10083f 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -494,6 +494,7 @@ struct uio;
caddr_t bufhashinit __P((caddr_t));
void bufinit __P((void));
void bwillwrite __P((void));
+int buf_dirty_count_severe __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 582d00c..75462f6 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -129,6 +129,7 @@ struct vnode {
short vpi_events; /* what they are looking for */
short vpi_revents; /* what has happened */
} v_pollinfo;
+ struct proc *v_vxproc; /* proc owning VXLOCK */
#ifdef DEBUG_LOCKS
const char *filename; /* Source file doing locking */
int line; /* Line number doing locking */
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 30f36ee7..a8ae464 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -45,6 +45,7 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
#include <sys/stat.h>
#include <vm/vm.h>
@@ -111,6 +112,8 @@ ffs_update(vp, waitfor)
ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
if (waitfor && !DOINGASYNC(vp)) {
return (bwrite(bp));
+ } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ return (bwrite(bp));
} else {
if (bp->b_bufsize == fs->fs_bsize)
bp->b_flags |= B_CLUSTEROK;
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 98ad959..c6ac0bd 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -91,6 +91,8 @@ MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
+#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
+
#define D_PAGEDEP 0
#define D_INODEDEP 1
#define D_NEWBLK 2
@@ -802,7 +804,7 @@ top:
goto top;
}
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
- M_WAITOK);
+ M_SOFTDEP_FLAGS);
bzero(pagedep, sizeof(struct pagedep));
pagedep->pd_list.wk_type = D_PAGEDEP;
pagedep->pd_mnt = mp;
@@ -879,7 +881,7 @@ top:
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
- M_INODEDEP, M_WAITOK);
+ M_INODEDEP, M_SOFTDEP_FLAGS);
inodedep->id_list.wk_type = D_INODEDEP;
inodedep->id_fs = fs;
inodedep->id_ino = inum;
@@ -941,7 +943,7 @@ top:
if (sema_get(&newblk_in_progress, 0) == 0)
goto top;
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
- M_NEWBLK, M_WAITOK);
+ M_NEWBLK, M_SOFTDEP_FLAGS);
newblk->nb_state = 0;
newblk->nb_fs = fs;
newblk->nb_newblkno = newblkno;
@@ -1127,7 +1129,7 @@ bmsafemap_lookup(bp)
return (WK_BMSAFEMAP(wk));
FREE_LOCK(&lk);
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
- M_BMSAFEMAP, M_WAITOK);
+ M_BMSAFEMAP, M_SOFTDEP_FLAGS);
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
bmsafemap->sm_list.wk_state = 0;
bmsafemap->sm_buf = bp;
@@ -1187,7 +1189,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
struct newblk *newblk;
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_WAITOK);
+ M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
bzero(adp, sizeof(struct allocdirect));
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
@@ -1339,7 +1341,7 @@ newfreefrag(ip, blkno, size)
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
- M_FREEFRAG, M_WAITOK);
+ M_FREEFRAG, M_SOFTDEP_FLAGS);
freefrag->ff_list.wk_type = D_FREEFRAG;
freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
freefrag->ff_inum = ip->i_number;
@@ -1408,7 +1410,7 @@ newallocindir(ip, ptrno, newblkno, oldblkno)
struct allocindir *aip;
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
- M_ALLOCINDIR, M_WAITOK);
+ M_ALLOCINDIR, M_SOFTDEP_FLAGS);
bzero(aip, sizeof(struct allocindir));
aip->ai_list.wk_type = D_ALLOCINDIR;
aip->ai_state = ATTACHED;
@@ -1561,7 +1563,7 @@ setup_allocindir_phase2(bp, ip, aip)
if (indirdep)
break;
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
- M_INDIRDEP, M_WAITOK);
+ M_INDIRDEP, M_SOFTDEP_FLAGS);
newindirdep->ir_list.wk_type = D_INDIRDEP;
newindirdep->ir_state = ATTACHED;
LIST_INIT(&newindirdep->ir_deplisthd);
@@ -1623,7 +1625,7 @@ softdep_setup_freeblocks(ip, length)
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
- M_FREEBLKS, M_WAITOK);
+ M_FREEBLKS, M_SOFTDEP_FLAGS);
bzero(freeblks, sizeof(struct freeblks));
freeblks->fb_list.wk_type = D_FREEBLKS;
freeblks->fb_uid = ip->i_uid;
@@ -1870,7 +1872,7 @@ softdep_freefile(pvp, ino, mode)
* This sets up the inode de-allocation dependency.
*/
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
- M_FREEFILE, M_WAITOK);
+ M_FREEFILE, M_SOFTDEP_FLAGS);
freefile->fx_list.wk_type = D_FREEFILE;
freefile->fx_list.wk_state = 0;
freefile->fx_mode = mode;
@@ -2186,7 +2188,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
offset = blkoff(fs, diroffset);
- MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
+ MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_offset = offset;
@@ -2198,12 +2200,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
- M_WAITOK);
+ M_SOFTDEP_FLAGS);
mkdir1->md_list.wk_type = D_MKDIR;
mkdir1->md_state = MKDIR_BODY;
mkdir1->md_diradd = dap;
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
- M_WAITOK);
+ M_SOFTDEP_FLAGS);
mkdir2->md_list.wk_type = D_MKDIR;
mkdir2->md_state = MKDIR_PARENT;
mkdir2->md_diradd = dap;
@@ -2438,7 +2440,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
(void) request_cleanup(FLUSH_REMOVE, 0);
num_dirrem += 1;
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
- M_DIRREM, M_WAITOK);
+ M_DIRREM, M_SOFTDEP_FLAGS);
bzero(dirrem, sizeof(struct dirrem));
dirrem->dm_list.wk_type = D_DIRREM;
dirrem->dm_state = isrmdir ? RMDIR : 0;
@@ -2535,7 +2537,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
*/
if (newinum != WINO) {
MALLOC(dap, struct diradd *, sizeof(struct diradd),
- M_DIRADD, M_WAITOK);
+ M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
@@ -2841,7 +2843,7 @@ softdep_disk_io_initiation(bp)
* Replace up-to-date version with safe version.
*/
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
- M_INDIRDEP, M_WAITOK);
+ M_INDIRDEP, M_SOFTDEP_FLAGS);
ACQUIRE_LOCK(&lk);
indirdep->ir_state &= ~ATTACHED;
indirdep->ir_state |= UNDONE;
@@ -2942,7 +2944,7 @@ initiate_write_inodeblock(inodedep, bp)
if (inodedep->id_savedino != NULL)
panic("initiate_write_inodeblock: already doing I/O");
MALLOC(inodedep->id_savedino, struct dinode *,
- sizeof(struct dinode), M_INODEDEP, M_WAITOK);
+ sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
*inodedep->id_savedino = *dp;
bzero((caddr_t)dp, sizeof(struct dinode));
return;
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index be43550..785219c 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -48,6 +48,7 @@
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <sys/event.h>
+#include <sys/vmmeter.h>
#define VN_KNOTE(vp, b) \
KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
@@ -501,6 +502,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
+ } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ bp->b_flags |= B_CLUSTEROK;
+ bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
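The ffs_update() and WRITE() hunks above apply the same policy: under page or dirty-buffer pressure, start the write immediately and asynchronously instead of delaying it. A stubbed-out sketch of that decision follows; the pressure variables are placeholders for the kernel predicates (vm_page_count_severe(), buf_dirty_count_severe()), not the real functions:

#include <stdio.h>

/* placeholder pressure indicators; the kernel consults the VM page
 * counters and numdirtybuffers vs. hidirtybuffers instead */
static int page_shortage;
static int dirty_buf_shortage;

/* pick how to dispose of a just-filled buffer on the write path */
static const char *write_policy(int caller_wants_sync)
{
	if (caller_wants_sync)
		return ("bwrite: synchronous, caller waits for completion");
	if (page_shortage || dirty_buf_shortage)
		return ("bawrite: start async I/O now to relieve the shortage");
	return ("bdwrite: delay the write so the buffer cache can aggregate it");
}

int main(void)
{
	printf("normal:    %s\n", write_policy(0));
	page_shortage = 1;
	printf("low mem:   %s\n", write_policy(0));
	printf("sync req:  %s\n", write_policy(1));
	return 0;
}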
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 6a427c9..a625bc8 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -80,6 +80,7 @@
#include <sys/sysctl.h>
#include <sys/blist.h>
#include <sys/lock.h>
+#include <sys/vmmeter.h>
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
@@ -1619,10 +1620,11 @@ swp_pager_async_iodone(bp)
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiter's up ).
*/
- vm_page_protect(m, VM_PROT_READ);
pmap_clear_modify(m);
vm_page_undirty(m);
vm_page_io_finish(m);
+ if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
+ vm_page_protect(m, VM_PROT_READ);
}
}
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 038a5ad..9c868fc 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -860,7 +860,7 @@ loop:
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
- if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
+ if (vm_paging_needed())
pagedaemon_wakeup();
splx(s);
@@ -882,10 +882,10 @@ vm_wait()
s = splvm();
if (curproc == pageproc) {
vm_pageout_pages_needed = 1;
- tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
+ tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0);
} else {
if (!vm_pages_needed) {
- vm_pages_needed++;
+ vm_pages_needed = 1;
wakeup(&vm_pages_needed);
}
tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
@@ -1030,7 +1030,8 @@ vm_page_free_wakeup()
* if pageout daemon needs pages, then tell it that there are
* some free.
*/
- if (vm_pageout_pages_needed) {
+ if (vm_pageout_pages_needed &&
+ cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
wakeup(&vm_pageout_pages_needed);
vm_pageout_pages_needed = 0;
}
@@ -1039,9 +1040,9 @@ vm_page_free_wakeup()
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
- if (vm_pages_needed && vm_page_count_min()) {
- wakeup(&cnt.v_free_count);
+ if (vm_pages_needed && !vm_page_count_min()) {
vm_pages_needed = 0;
+ wakeup(&cnt.v_free_count);
}
}
@@ -1240,6 +1241,9 @@ vm_page_wire(m)
* processes. This optimization causes one-time-use metadata to be
* reused more quickly.
*
+ * BUT, if we are in a low-memory situation we have no choice but to
+ * put clean pages on the cache queue.
+ *
* A number of routines use vm_page_unwire() to guarantee that the page
* will go into either the inactive or active queues, and will NEVER
* be placed in the cache - for example, just after dirtying a page.
@@ -1326,6 +1330,25 @@ vm_page_deactivate(vm_page_t m)
}
/*
+ * vm_page_try_to_cache:
+ *
+ * Returns 0 on failure, 1 on success
+ */
+int
+vm_page_try_to_cache(vm_page_t m)
+{
+ if (m->dirty || m->hold_count || m->busy || m->wire_count ||
+ (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+ return(0);
+ }
+ vm_page_test_dirty(m);
+ if (m->dirty)
+ return(0);
+ vm_page_cache(m);
+ return(1);
+}
+
+/*
* vm_page_cache
*
* Put the specified page onto the page cache queue (if appropriate).
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index cf58985..4c31df9 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -251,6 +251,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
#define PG_NOSYNC 0x0400 /* do not collect for syncer */
#define PG_UNMANAGED 0x0800 /* No PV management for page */
+#define PG_MARKER 0x1000 /* special queue marker page */
/*
* Misc constants.
@@ -403,6 +404,7 @@ void vm_page_activate __P((vm_page_t));
vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
+int vm_page_try_to_cache __P((vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index d12ecac..4ab3930 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -146,6 +146,7 @@ static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
static int max_page_launder=100;
+static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
@@ -189,6 +190,8 @@ SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
+SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
+ CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
#define VM_PAGEOUT_PAGE_COUNT 16
@@ -372,6 +375,7 @@ vm_pageout_flush(mc, count, flags)
*/
for (i = 0; i < count; i++) {
+ KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL && mc[i]->dirty == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially dirty page", mc[i], i, count));
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
}
@@ -424,6 +428,8 @@ vm_pageout_flush(mc, count, flags)
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_io_finish(mt);
+ if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
+ vm_page_protect(mt, VM_PROT_READ);
}
}
return numpagedout;
@@ -621,10 +627,10 @@ static int
vm_pageout_scan()
{
vm_page_t m, next;
+ struct vm_page marker;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
int maxlaunder;
- int launder_loop = 0;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
@@ -646,33 +652,37 @@ vm_pageout_scan()
/*
* Calculate the number of pages we want to either free or move
- * to the cache.
+ * to the cache. Be more agressive if we aren't making our target.
*/
- page_shortage = vm_paging_target() + addl_page_shortage_init;
+ page_shortage = vm_paging_target() +
+ addl_page_shortage_init + vm_pageout_actcmp;
/*
- * Figure out what to do with dirty pages when they are encountered.
- * Assume that 1/3 of the pages on the inactive list are clean. If
- * we think we can reach our target, disable laundering (do not
- * clean any dirty pages). If we miss the target we will loop back
- * up and do a laundering run.
+ * Figure out how agressively we should flush dirty pages.
*/
+ {
+ int factor = vm_pageout_actcmp;
- if (cnt.v_inactive_count / 3 > page_shortage) {
- maxlaunder = 0;
- launder_loop = 0;
- } else {
- maxlaunder =
- (cnt.v_inactive_target > max_page_launder) ?
- max_page_launder : cnt.v_inactive_target;
- launder_loop = 1;
+ maxlaunder = cnt.v_inactive_target / 3 + factor;
+ if (maxlaunder > max_page_launder + factor)
+ maxlaunder = max_page_launder + factor;
}
/*
+ * Initialize our marker
+ */
+ bzero(&marker, sizeof(marker));
+ marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+ marker.queue = PQ_INACTIVE;
+ marker.wire_count = 1;
+
+ /*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
- * we have scanned the entire inactive queue.
+ * we have scanned the entire inactive queue. Note that m->act_count
+ * is not used to form decisions for the inactive queue, only for the
+ * active queue.
*/
rescan0:
@@ -690,6 +700,12 @@ rescan0:
next = TAILQ_NEXT(m, pageq);
+ /*
+ * skip marker pages
+ */
+ if (m->flags & PG_MARKER)
+ continue;
+
if (m->hold_count) {
s = splvm();
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
@@ -766,7 +782,8 @@ rescan0:
--page_shortage;
/*
- * Clean pages can be placed onto the cache queue.
+ * Clean pages can be placed onto the cache queue. This
+ * effectively frees them.
*/
} else if (m->dirty == 0) {
vm_page_cache(m);
@@ -777,7 +794,6 @@ rescan0:
* only a limited number of pages per pagedaemon pass.
*/
} else if (maxlaunder > 0) {
- int written;
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
@@ -806,29 +822,6 @@ rescan0:
}
/*
- * For now we protect against potential memory
- * deadlocks by requiring significant memory to be
- * free if the object is not OBJT_DEFAULT or OBJT_SWAP.
- * We do not 'trust' any other object type to operate
- * with low memory, not even OBJT_DEVICE. The VM
- * allocator will special case allocations done by
- * the pageout daemon so the check below actually
- * does have some hysteresis in it. It isn't the best
- * solution, though.
- */
-
- if (object->type != OBJT_DEFAULT &&
- object->type != OBJT_SWAP &&
- cnt.v_free_count < cnt.v_free_reserved) {
- s = splvm();
- TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
- pageq);
- splx(s);
- continue;
- }
-
- /*
* Presumably we have sufficient free memory to do
* the more sophisticated checks and locking required
* for vnodes.
@@ -879,10 +872,15 @@ rescan0:
}
/*
- * The page might have been moved to another queue
- * during potential blocking in vget() above.
+ * The page might have been moved to another
+ * queue during potential blocking in vget()
+ * above. The page might have been freed and
+ * reused for another vnode. The object might
+ * have been reused for another vnode.
*/
- if (m->queue != PQ_INACTIVE) {
+ if (m->queue != PQ_INACTIVE ||
+ m->object != object ||
+ object->handle != vp) {
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
@@ -891,9 +889,10 @@ rescan0:
}
/*
- * The page may have been busied during the blocking in
- * vput(); We don't move the page back onto the end of
- * the queue so that statistics are more correct if we don't.
+ * The page may have been busied during the
+ * blocking in vput(); We don't move the
+ * page back onto the end of the queue so that
+ * statistics are more correct if we don't.
*/
if (m->busy || (m->flags & PG_BUSY)) {
vput(vp);
@@ -921,42 +920,57 @@ rescan0:
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
- * start the cleaning operation.
+ * start the cleaning operation. maxlaunder nominally
+ * counts I/O cost (seeks) rather then bytes.
+ *
+ * This operation may cluster, invalidating the 'next'
+ * pointer. To prevent an inordinate number of
+ * restarts we use our marker to remember our place.
*/
- written = vm_pageout_clean(m);
+ s = splvm();
+ TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
+ splx(s);
+ if (vm_pageout_clean(m) != 0)
+ --maxlaunder;
+ s = splvm();
+ next = TAILQ_NEXT(&marker, pageq);
+ TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
+ splx(s);
if (vp) {
vput(vp);
vn_finished_write(mp);
}
-
- maxlaunder -= written;
}
}
/*
- * If we still have a page shortage and we didn't launder anything,
- * run the inactive scan again and launder something this time.
+ * If we were not able to meet our target, increase actcmp
*/
- if (launder_loop == 0 && page_shortage > 0) {
- launder_loop = 1;
- maxlaunder =
- (cnt.v_inactive_target > max_page_launder) ?
- max_page_launder : cnt.v_inactive_target;
- goto rescan0;
+ if (vm_page_count_min()) {
+ if (vm_pageout_actcmp < ACT_MAX / 2)
+ vm_pageout_actcmp += ACT_ADVANCE;
+ } else {
+ if (vm_pageout_actcmp < ACT_DECLINE)
+ vm_pageout_actcmp = 0;
+ else
+ vm_pageout_actcmp -= ACT_DECLINE;
}
/*
- * Compute the page shortage from the point of view of having to
- * move pages from the active queue to the inactive queue.
+ * Compute the number of pages we want to try to move from the
+ * active queue to the inactive queue.
*/
- page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) -
- (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
+ page_shortage = vm_paging_target() +
+ cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
+ page_shortage += vm_pageout_actcmp;
/*
- * Scan the active queue for things we can deactivate
+ * Scan the active queue for things we can deactivate. We nominally
+ * track the per-page activity counter and use it to locate
+ * deactivation candidates.
*/
pcount = cnt.v_active_count;
@@ -1026,7 +1040,8 @@ rescan0:
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
if (vm_pageout_algorithm_lru ||
- (m->object->ref_count == 0) || (m->act_count == 0)) {
+ (m->object->ref_count == 0) ||
+ (m->act_count <= vm_pageout_actcmp)) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
@@ -1111,7 +1126,7 @@ rescan0:
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
*/
- if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
+ if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
bigproc = NULL;
bigsize = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@@ -1349,20 +1364,31 @@ vm_pageout()
int error;
int s = splvm();
- if (vm_pages_needed && vm_page_count_min()) {
+ /*
+ * If we have enough free memory, wakeup waiters. Do
+ * not clear vm_pages_needed until we reach our target,
+ * otherwise we may be woken up over and over again and
+ * waste a lot of cpu.
+ */
+ if (vm_pages_needed && !vm_page_count_min()) {
+ if (vm_paging_needed() <= 0)
+ vm_pages_needed = 0;
+ wakeup(&cnt.v_free_count);
+ }
+ if (vm_pages_needed) {
/*
* Still not done, sleep a bit and go again
*/
- vm_pages_needed = 0;
tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
* Good enough, sleep & handle stats
*/
- vm_pages_needed = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
+ if (vm_pageout_actcmp > 0)
+ --vm_pageout_actcmp;
splx(s);
vm_pageout_page_stats();
continue;
@@ -1371,11 +1397,9 @@ vm_pageout()
if (vm_pages_needed)
cnt.v_pdwakeups++;
- vm_pages_needed = 0;
splx(s);
vm_pageout_scan();
vm_pageout_deficit = 0;
- wakeup(&cnt.v_free_count);
}
}
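The PG_MARKER changes above let vm_pageout_scan() remember its place in the inactive queue across vm_pageout_clean(), which can block and cluster, invalidating a saved 'next' pointer. A self-contained sketch of the marker technique using <sys/queue.h> follows; the toy 'page' type is an assumption for illustration:

#include <stdio.h>
#include <sys/queue.h>

/* a toy "page" with a marker flag, mimicking PG_MARKER */
struct page {
	TAILQ_ENTRY(page) q;
	int id;
	int is_marker;
};

TAILQ_HEAD(pageq, page);

int main(void)
{
	struct pageq pq = TAILQ_HEAD_INITIALIZER(pq);
	struct page pages[5], marker = { .id = -1, .is_marker = 1 };

	for (int i = 0; i < 5; i++) {
		pages[i].id = i;
		pages[i].is_marker = 0;
		TAILQ_INSERT_TAIL(&pq, &pages[i], q);
	}

	for (struct page *m = TAILQ_FIRST(&pq), *next; m != NULL; m = next) {
		next = TAILQ_NEXT(m, q);
		if (m->is_marker)	/* skip other scanners' markers */
			continue;

		/*
		 * Park the marker right after the current page, do work that
		 * (in the real scanner) may block and reshuffle the queue,
		 * then resume from the marker rather than from a possibly
		 * stale 'next' pointer.
		 */
		TAILQ_INSERT_AFTER(&pq, m, &marker, q);
		printf("processing page %d\n", m->id);
		next = TAILQ_NEXT(&marker, q);
		TAILQ_REMOVE(&pq, &marker, q);
	}
	return 0;
}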