-rw-r--r--  sys/sys/vmmeter.h    86
-rw-r--r--  sys/vm/vm_glue.c     15
-rw-r--r--  sys/vm/vm_meter.c     2
-rw-r--r--  sys/vm/vm_page.c     96
-rw-r--r--  sys/vm/vm_page.h      4
-rw-r--r--  sys/vm/vm_pageout.c 187
6 files changed, 274 insertions, 116 deletions
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
index e382d90..2ae45a3 100644
--- a/sys/sys/vmmeter.h
+++ b/sys/sys/vmmeter.h
@@ -91,9 +91,95 @@ struct vmmeter {
u_int v_cache_max; /* max number of pages in cached obj */
u_int v_pageout_free_min; /* min number pages reserved for kernel */
u_int v_interrupt_free_min; /* reserved number of pages for int code */
+ u_int v_free_severe; /* severe depletion of pages below this pt */
};
#ifdef KERNEL
+
extern struct vmmeter cnt;
+
+/*
+ * Return TRUE if we are under our reserved low-free-pages threshold
+ */
+
+static __inline
+int
+vm_page_count_reserved(void)
+{
+ return (cnt.v_free_reserved > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we are under our severe low-free-pages threshold
+ *
+ * This routine is typically used at the user<->system interface to determine
+ * whether we need to block in order to avoid a low memory deadlock.
+ */
+
+static __inline
+int
+vm_page_count_severe(void)
+{
+ return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we are under our minimum low-free-pages threshold.
+ *
+ * This routine is typically used within the system to determine whether
+ * we can execute potentially very expensive code in terms of memory. It
+ * is also used by the pageout daemon to calculate when to sleep, when
+ * to wake waiters up, and when (after making a pass) to become more
+ * desperate.
+ */
+
+static __inline
+int
+vm_page_count_min(void)
+{
+ return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we have not reached our free page target during
+ * free page recovery operations.
+ */
+
+static __inline
+int
+vm_page_count_target(void)
+{
+ return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return the number of pages we need to free up or cache.
+ * A positive number indicates that we do not have enough free pages.
+ */
+
+static __inline
+int
+vm_paging_target(void)
+{
+ return (
+ (cnt.v_free_target + cnt.v_cache_min) -
+ (cnt.v_free_count + cnt.v_cache_count)
+ );
+}
+
+/*
+ * Return a positive number if the pagedaemon needs to be woken up.
+ */
+
+static __inline
+int
+vm_paging_needed(void)
+{
+ return (
+ (cnt.v_free_reserved + cnt.v_cache_min) >
+ (cnt.v_free_count + cnt.v_cache_count)
+ );
+}
+
#endif
/* systemwide totals computed every five seconds */
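The inlines above all compare a fixed threshold against cnt.v_free_count + cnt.v_cache_count, with v_free_severe sitting below v_free_min, which in turn sits below v_free_target. The following standalone sketch is not kernel code; the struct and every counter value in it are invented purely to show what each predicate returns in one plausible state.

#include <stdio.h>

/* userland model of the vmmeter fields the new inlines look at */
struct vmmeter_model {
	unsigned v_free_reserved, v_free_severe, v_free_min, v_free_target;
	unsigned v_cache_min, v_free_count, v_cache_count;
};

int
main(void)
{
	struct vmmeter_model cnt = {
		.v_free_reserved = 200, .v_free_severe = 430,
		.v_free_min = 660, .v_free_target = 900,
		.v_cache_min = 300, .v_free_count = 400, .v_cache_count = 50
	};
	unsigned avail = cnt.v_free_count + cnt.v_cache_count;	/* 450 */

	/* the same comparisons the inlines perform */
	printf("severe:        %d\n", cnt.v_free_severe > avail);	/* 0 */
	printf("min:           %d\n", cnt.v_free_min > avail);		/* 1 */
	printf("target:        %d\n", cnt.v_free_target > avail);	/* 1 */

	/* vm_paging_target(): (900 + 300) - 450 = 750 pages still to free or cache */
	printf("paging target: %d\n",
	    (int)(cnt.v_free_target + cnt.v_cache_min - avail));

	/* vm_paging_needed(): (200 + 300) > 450, so the pagedaemon should be woken */
	printf("paging needed: %d\n",
	    cnt.v_free_reserved + cnt.v_cache_min > avail);
	return (0);
}

The payoff of centralizing these tests shows up in the rest of the patch: every open-coded comparison against cnt.v_free_count + cnt.v_cache_count in vm_glue.c, vm_page.c and vm_pageout.c below is replaced by one of these predicates.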
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index e53079a..1d7157c 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -209,19 +209,9 @@ vm_fork(p1, p2, flags)
p1->p_vmspace->vm_refcnt++;
}
- /*
- * Great, so we have a memory-heavy process and the
- * entire machine comes to a screaching halt because
- * nobody can fork/exec anything. What we really need
- * to do is fix the process swapper so it swaps out the right
- * processes.
- */
-#if 0
- while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
- vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN);
+ while (vm_page_count_severe()) {
VM_WAIT;
}
-#endif
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
@@ -339,8 +329,9 @@ scheduler(dummy)
int ppri;
loop:
- while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
+ if (vm_page_count_min()) {
VM_WAIT;
+ goto loop;
}
pp = NULL;
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 4e7f0fb..6c69562 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -119,6 +119,8 @@ SYSCTL_INT(_vm, VM_V_CACHE_MAX, v_cache_max,
CTLFLAG_RW, &cnt.v_cache_max, 0, "");
SYSCTL_INT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, v_free_severe,
+ CTLFLAG_RW, &cnt.v_free_severe, 0, "");
SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD,
&averunnable, loadavg, "Machine loadaverage history");
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index f6db00e..533ba37 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -615,8 +615,7 @@ vm_page_unqueue(m)
(*pq->cnt)--;
pq->lcnt--;
if ((queue - m->pc) == PQ_CACHE) {
- if ((cnt.v_cache_count + cnt.v_free_count) <
- (cnt.v_free_reserved + cnt.v_cache_min))
+ if (vm_paging_needed())
pagedaemon_wakeup();
}
}
@@ -871,9 +870,7 @@ loop:
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
- if (((cnt.v_free_count + cnt.v_cache_count) <
- (cnt.v_free_reserved + cnt.v_cache_min)) ||
- (cnt.v_free_count < cnt.v_pageout_free_min))
+ if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
pagedaemon_wakeup();
splx(s);
@@ -991,6 +988,8 @@ vm_page_asleep(vm_page_t m, char *msg, char *busy) {
* vm_page_activate:
*
* Put the specified page on the active list (if appropriate).
+ * Ensure that act_count is at least ACT_INIT but do not otherwise
+ * mess with it.
*
* The page queues must be locked.
* This routine may not block.
@@ -1050,8 +1049,7 @@ vm_page_free_wakeup()
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
- if (vm_pages_needed &&
- ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
+ if (vm_pages_needed && !vm_page_count_min()) {
wakeup(&cnt.v_free_count);
vm_pages_needed = 0;
}
@@ -1261,11 +1259,14 @@ vm_page_unwire(m, activate)
* Move the specified page to the inactive queue. If the page has
* any associated swap, the swap is deallocated.
*
+ * Normally athead is 0 resulting in LRU operation. athead is set
+ * to 1 if we want this page to be 'as if it were placed in the cache',
+ * except without unmapping it from the process address space.
+ *
* This routine may not block.
*/
-void
-vm_page_deactivate(m)
- register vm_page_t m;
+static __inline void
+_vm_page_deactivate(vm_page_t m, int athead)
{
int s;
@@ -1280,7 +1281,10 @@ vm_page_deactivate(m)
if ((m->queue - m->pc) == PQ_CACHE)
cnt.v_reactivated++;
vm_page_unqueue(m);
- TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
+ if (athead)
+ TAILQ_INSERT_HEAD(&vm_page_queue_inactive, m, pageq);
+ else
+ TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
m->queue = PQ_INACTIVE;
vm_page_queues[PQ_INACTIVE].lcnt++;
cnt.v_inactive_count++;
@@ -1288,6 +1292,12 @@ vm_page_deactivate(m)
splx(s);
}
+void
+vm_page_deactivate(vm_page_t m)
+{
+ _vm_page_deactivate(m, 0);
+}
+
/*
* vm_page_cache
*
@@ -1333,6 +1343,70 @@ vm_page_cache(m)
}
/*
+ * vm_page_dontneed
+ *
+ * Cache, deactivate, or do nothing as appropriate. This routine
+ * is typically used by madvise() MADV_DONTNEED.
+ *
+ * Generally speaking we want to move the page into the cache so
+ * it gets reused quickly. However, this can result in a silly syndrome
+ * due to the page recycling too quickly. Small objects will not be
+ * fully cached. On the other hand, if we move the page to the inactive
+ * queue we wind up with a problem whereby very large objects
+ * unnecessarily blow away our inactive and cache queues.
+ *
+ * The solution is to move the pages based on a fixed weighting. We
+ * either leave them alone, deactivate them, or move them to the cache,
+ * where moving them to the cache has the highest weighting.
+ * By forcing some pages into other queues we eventually force the
+ * system to balance the queues, potentially recovering other unrelated
+ * space from active. The idea is to not force this to happen too
+ * often.
+ */
+
+void
+vm_page_dontneed(m)
+ vm_page_t m;
+{
+ static int dnweight;
+ int dnw;
+ int head;
+
+ dnw = ++dnweight;
+
+ /*
+ * occasionally leave the page alone
+ */
+
+ if ((dnw & 0x01F0) == 0 ||
+ m->queue == PQ_INACTIVE ||
+ m->queue - m->pc == PQ_CACHE
+ ) {
+ if (m->act_count >= ACT_INIT)
+ --m->act_count;
+ return;
+ }
+
+ if (m->dirty == 0)
+ vm_page_test_dirty(m);
+
+ if (m->dirty || (dnw & 0x0070) == 0) {
+ /*
+ * Deactivate the page 3 times out of 32.
+ */
+ head = 0;
+ } else {
+ /*
+ * Cache the page 28 times out of every 32. Note that
+ * the page is deactivated instead of cached, but placed
+ * at the head of the queue instead of the tail.
+ */
+ head = 1;
+ }
+ _vm_page_deactivate(m, head);
+}
+
+/*
* Grab a page, waiting until we are waken up due to the page
* changing state. We keep on waiting, if the page continues
* to be in the object. If the page doesn't exist, allocate it.
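The 1:3:28 weighting documented in the vm_page_dontneed() comment falls directly out of the two bit masks applied to the dnweight counter, and the "cache-like" case is implemented by passing athead=1 to _vm_page_deactivate() so the page goes to the head of the inactive queue while staying mapped. The sketch below is not kernel code: it only counts branches over 512 consecutive calls and assumes the page is always clean and not already inactive or cached (in the real routine a dirty page is always deactivated rather than treated as cache-like).

#include <stdio.h>

int
main(void)
{
	static int dnweight;
	int leave = 0, deact = 0, cachelike = 0;

	for (int i = 0; i < 512; ++i) {
		int dnw = ++dnweight;

		if ((dnw & 0x01F0) == 0)
			++leave;	/* leave the page where it is */
		else if ((dnw & 0x0070) == 0)
			++deact;	/* _vm_page_deactivate(m, 0): tail of inactive */
		else
			++cachelike;	/* _vm_page_deactivate(m, 1): head of inactive */
	}
	/* prints 16 / 48 / 448, i.e. 1 : 3 : 28 out of every 32 calls */
	printf("leave %d, deactivate %d, cache-like %d\n", leave, deact, cachelike);
	return (0);
}

Head-inserted pages are the first ones the inactive scan reaches, so they are reclaimed almost as quickly as cached pages, which is the behaviour the comment above _vm_page_deactivate() describes.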
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 6ffb867..2d7e740 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -136,7 +136,8 @@ struct vm_page {
};
/*
- * note SWAPBLK_NONE is a flag, basically the high bit.
+ * note: we currently use SWAPBLK_NONE as an absolute value rather than
+ * a flag bit.
*/
#define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */
@@ -391,6 +392,7 @@ void vm_page_activate __P((vm_page_t));
vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
+void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));
static __inline void vm_page_free_zero __P((vm_page_t));
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index bc8784c..d24e51c 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -219,7 +219,7 @@ vm_pageout_clean(m)
register vm_object_t object;
vm_page_t mc[2*vm_pageout_page_count];
int pageout_count;
- int i, forward_okay, backward_okay, page_base;
+ int ib, is, page_base;
vm_pindex_t pindex = m->pindex;
object = m->object;
@@ -243,11 +243,9 @@ vm_pageout_clean(m)
mc[vm_pageout_page_count] = m;
pageout_count = 1;
page_base = vm_pageout_page_count;
- forward_okay = TRUE;
- if (pindex != 0)
- backward_okay = TRUE;
- else
- backward_okay = FALSE;
+ ib = 1;
+ is = 1;
+
/*
* Scan object for clusterable pages.
*
@@ -258,82 +256,84 @@ vm_pageout_clean(m)
* active page.
* -or-
* 2) we force the issue.
+ *
+ * During heavy mmap/modification loads the pageout
+ * daemon can really fragment the underlying file
+ * due to flushing pages out of order and not trying to
+ * align the clusters (which leaves sporadic out-of-order
+ * holes). To solve this problem we do the reverse scan
+ * first and attempt to align our cluster, then do a
+ * forward scan if room remains.
*/
- for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) {
+
+more:
+ while (ib && pageout_count < vm_pageout_page_count) {
vm_page_t p;
- /*
- * See if forward page is clusterable.
- */
- if (forward_okay) {
- /*
- * Stop forward scan at end of object.
- */
- if ((pindex + i) > object->size) {
- forward_okay = FALSE;
- goto do_backward;
- }
- p = vm_page_lookup(object, pindex + i);
- if (p) {
- if (((p->queue - p->pc) == PQ_CACHE) ||
- (p->flags & PG_BUSY) || p->busy) {
- forward_okay = FALSE;
- goto do_backward;
- }
- vm_page_test_dirty(p);
- if ((p->dirty & p->valid) != 0 &&
- (p->queue == PQ_INACTIVE) &&
- (p->wire_count == 0) &&
- (p->hold_count == 0)) {
- mc[vm_pageout_page_count + i] = p;
- pageout_count++;
- if (pageout_count == vm_pageout_page_count)
- break;
- } else {
- forward_okay = FALSE;
- }
- } else {
- forward_okay = FALSE;
- }
+ if (ib > pindex) {
+ ib = 0;
+ break;
+ }
+
+ if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
+ ib = 0;
+ break;
+ }
+ if (((p->queue - p->pc) == PQ_CACHE) ||
+ (p->flags & PG_BUSY) || p->busy) {
+ ib = 0;
+ break;
+ }
+ vm_page_test_dirty(p);
+ if ((p->dirty & p->valid) == 0 ||
+ p->queue != PQ_INACTIVE ||
+ p->wire_count != 0 ||
+ p->hold_count != 0) {
+ ib = 0;
+ break;
}
-do_backward:
+ mc[--page_base] = p;
+ ++pageout_count;
+ ++ib;
/*
- * See if backward page is clusterable.
+ * alignment boundary, stop here and switch directions. Do
+ * not clear ib.
*/
- if (backward_okay) {
- /*
- * Stop backward scan at beginning of object.
- */
- if ((pindex - i) == 0) {
- backward_okay = FALSE;
- }
- p = vm_page_lookup(object, pindex - i);
- if (p) {
- if (((p->queue - p->pc) == PQ_CACHE) ||
- (p->flags & PG_BUSY) || p->busy) {
- backward_okay = FALSE;
- continue;
- }
- vm_page_test_dirty(p);
- if ((p->dirty & p->valid) != 0 &&
- (p->queue == PQ_INACTIVE) &&
- (p->wire_count == 0) &&
- (p->hold_count == 0)) {
- mc[vm_pageout_page_count - i] = p;
- pageout_count++;
- page_base--;
- if (pageout_count == vm_pageout_page_count)
- break;
- } else {
- backward_okay = FALSE;
- }
- } else {
- backward_okay = FALSE;
- }
+ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
+ break;
+ }
+
+ while (pageout_count < vm_pageout_page_count &&
+ pindex + is < object->size) {
+ vm_page_t p;
+
+ if ((p = vm_page_lookup(object, pindex + is)) == NULL)
+ break;
+ if (((p->queue - p->pc) == PQ_CACHE) ||
+ (p->flags & PG_BUSY) || p->busy) {
+ break;
}
+ vm_page_test_dirty(p);
+ if ((p->dirty & p->valid) == 0 ||
+ p->queue != PQ_INACTIVE ||
+ p->wire_count != 0 ||
+ p->hold_count != 0) {
+ break;
+ }
+ mc[page_base + pageout_count] = p;
+ ++pageout_count;
+ ++is;
}
/*
+ * If we exhausted our forward scan, continue with the reverse scan
+ * when possible, even past a page boundary. This catches boundary
+ * conditions.
+ */
+ if (ib && pageout_count < vm_pageout_page_count)
+ goto more;
+
+ /*
* we allow reads during pageouts...
*/
return vm_pageout_flush(&mc[page_base], pageout_count, 0);
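The practical effect of scanning backwards first and stopping on a multiple of vm_pageout_page_count is that the flushed cluster tends to start on an aligned offset within the object, so successive flushes tile the file instead of leaving the sporadic out-of-order holes the comment above describes. The following sketch is not kernel code: it reproduces only the index arithmetic, assumes every neighbouring page is eligible, and uses a cluster size of 8 purely for illustration (the real limit is vm_pageout_page_count and each candidate page is checked for dirtiness, queue, wire and hold counts).

#include <stdio.h>

#define CLUSTER	8	/* stand-in for vm_pageout_page_count */

int
main(void)
{
	unsigned long pindex = 43;	/* page that triggered the flush */
	unsigned long mc[2 * CLUSTER];	/* holds page indices here, vm_page_t in the kernel */
	int page_base = CLUSTER;
	int pageout_count = 1;
	int ib = 1, is = 1;

	mc[CLUSTER] = pindex;

	/* reverse scan first, so the start of the cluster lines up */
	while (ib && pageout_count < CLUSTER) {
		if (ib > pindex)
			break;
		mc[--page_base] = pindex - ib;
		++pageout_count;
		++ib;
		if ((pindex - (ib - 1)) % CLUSTER == 0)
			break;		/* alignment boundary reached */
	}

	/* then use whatever room remains going forward */
	while (pageout_count < CLUSTER) {
		mc[page_base + pageout_count] = pindex + is;
		++pageout_count;
		++is;
	}

	/* prints 40 41 42 43 44 45 46 47: an aligned cluster around page 43 */
	for (int i = 0; i < pageout_count; ++i)
		printf("%lu ", mc[page_base + i]);
	printf("\n");
	return (0);
}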
@@ -397,7 +397,7 @@ vm_pageout_flush(mc, count, flags)
* worked.
*/
pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
- mt->dirty = 0;
+ vm_page_undirty(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
@@ -646,9 +646,7 @@ vm_pageout_scan()
* to the cache.
*/
- page_shortage = (cnt.v_free_target + cnt.v_cache_min) -
- (cnt.v_free_count + cnt.v_cache_count);
- page_shortage += addl_page_shortage_init;
+ page_shortage = vm_paging_target() + addl_page_shortage_init;
/*
* Figure out what to do with dirty pages when they are encountered.
@@ -787,7 +785,7 @@ rescan0:
} else {
swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
- (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min);
+ vm_page_count_min());
}
@@ -1082,15 +1080,11 @@ rescan0:
* in a writeable object, wakeup the sync daemon. And kick swapout
* if we did not get enough free pages.
*/
- if ((cnt.v_cache_count + cnt.v_free_count) <
- (cnt.v_free_target + cnt.v_cache_min) ) {
- if (vnodes_skipped &&
- (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) {
+ if (vm_paging_target() > 0) {
+ if (vnodes_skipped && vm_page_count_min())
(void) speedup_syncer();
- }
#if !defined(NO_SWAPPING)
- if (vm_swap_enabled &&
- (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) {
+ if (vm_swap_enabled && vm_page_count_target()) {
vm_req_vmdaemon();
vm_pageout_req_swapout |= VM_SWAP_NORMAL;
}
@@ -1101,8 +1095,7 @@ rescan0:
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
*/
- if ((vm_swap_size == 0 || swap_pager_full) &&
- ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) {
+ if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
bigproc = NULL;
bigsize = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@@ -1160,8 +1153,10 @@ vm_pageout_page_stats()
static int fullintervalcount = 0;
int page_shortage;
- page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
+ page_shortage =
+ (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
+
if (page_shortage <= 0)
return;
@@ -1253,7 +1248,9 @@ vm_size_t count;
cnt.v_interrupt_free_min;
cnt.v_free_reserved = vm_pageout_page_count +
cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
+ cnt.v_free_severe = cnt.v_free_min / 2;
cnt.v_free_min += cnt.v_free_reserved;
+ cnt.v_free_severe += cnt.v_free_reserved;
return 1;
}
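Worked through with invented numbers (the real values depend on the physical page count and on the initialization above this hunk): if the earlier code left cnt.v_free_min at 460 pages and cnt.v_free_reserved works out to 300, then v_free_severe becomes 460/2 + 300 = 530 while v_free_min becomes 460 + 300 = 760. In general the new severe threshold lands essentially halfway between v_free_reserved and the final v_free_min, which fits its use above as the harder backstop that vm_fork() blocks on.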
@@ -1326,8 +1323,17 @@ vm_pageout()
while (TRUE) {
int error;
int s = splvm();
- if (!vm_pages_needed ||
- ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) {
+
+ if (vm_pages_needed && vm_page_count_min()) {
+ /*
+ * Still not done, sleep a bit and go again
+ */
+ vm_pages_needed = 0;
+ tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+ } else {
+ /*
+ * Good enough, sleep & handle stats
+ */
vm_pages_needed = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
@@ -1336,9 +1342,6 @@ vm_pageout()
vm_pageout_page_stats();
continue;
}
- } else if (vm_pages_needed) {
- vm_pages_needed = 0;
- tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
}
if (vm_pages_needed)