author      dillon <dillon@FreeBSD.org>   1999-01-21 08:29:12 +0000
committer   dillon <dillon@FreeBSD.org>   1999-01-21 08:29:12 +0000
commit      df24433bbe29112b4b9c9f38e80ba6cfb6988cb0 (patch)
tree        b0a91cf38166034e837b98d5edacd8177a14aba6
parent      bae5debf723220e076d6a9696e417805639cdc3a (diff)
download    FreeBSD-src-df24433bbe29112b4b9c9f38e80ba6cfb6988cb0.zip
            FreeBSD-src-df24433bbe29112b4b9c9f38e80ba6cfb6988cb0.tar.gz
This is a rather large commit that encompasses the new swapper, changes
to the VM system to support the new swapper, VM bug fixes, several VM
optimizations, and some additional revamping of the VM code.  The
specific bug fixes will be documented with additional forced commits.
This commit is somewhat rough in regards to code cleanup issues.

Reviewed by:  "John S. Dyson" <root@dyson.iquest.net>,
              "David Greenman" <dg@root.com>
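Much of the churn in this patch is mechanical: direct manipulation of
vm_page_t flags (m->flags |= PG_BUSY and friends) and hand-rolled
PG_BUSY sleeps are replaced by vm_page accessor routines.  A minimal
sketch of the resulting lookup/busy pattern, modeled on the
pmap_page_lookup() hunks in the diff below (the function name and the
"exlook" wmesg are illustrative, not part of the commit):

/*
 * Sketch only: the busy/lookup idiom this patch standardizes on.
 */
static vm_page_t
example_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

retry:
	m = vm_page_lookup(object, pindex);
	if (m && vm_page_sleep_busy(m, FALSE, "exlook"))
		goto retry;		/* we slept, page state may have changed */
	if (m != NULL) {
		vm_page_busy(m);	/* replaces m->flags |= PG_BUSY */
		/*
		 * ... operate on the page; vm_page_flag_set() and
		 * vm_page_flag_clear() replace direct writes to m->flags ...
		 */
		vm_page_wakeup(m);	/* clears PG_BUSY and wakes any waiters */
	}
	return (m);
}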
-rw-r--r--  sys/alpha/alpha/pmap.c         |   21
-rw-r--r--  sys/alpha/alpha/symbols.raw    |    5
-rw-r--r--  sys/amd64/amd64/pmap.c         |   65
-rw-r--r--  sys/cam/cam_periph.c           |    6
-rw-r--r--  sys/conf/files                 |    1
-rw-r--r--  sys/conf/options               |    3
-rw-r--r--  sys/fs/procfs/procfs_map.c     |   10
-rw-r--r--  sys/fs/specfs/spec_vnops.c     |    8
-rw-r--r--  sys/i386/i386/pmap.c           |   65
-rw-r--r--  sys/i386/i386/symbols.raw      |    5
-rw-r--r--  sys/kern/kern_malloc.c         |   36
-rw-r--r--  sys/kern/kern_physio.c         |    8
-rw-r--r--  sys/kern/subr_rlist.c          |   11
-rw-r--r--  sys/kern/sysv_shm.c            |    3
-rw-r--r--  sys/kern/uipc_syscalls.c       |   29
-rw-r--r--  sys/kern/uipc_usrreq.c         |    9
-rw-r--r--  sys/kern/vfs_aio.c             |   10
-rw-r--r--  sys/kern/vfs_bio.c             |  119
-rw-r--r--  sys/kern/vfs_cluster.c         |   10
-rw-r--r--  sys/kern/vfs_export.c          |   46
-rw-r--r--  sys/kern/vfs_subr.c            |   46
-rw-r--r--  sys/miscfs/devfs/devfs_vnops.c |    8
-rw-r--r--  sys/miscfs/procfs/procfs_map.c |   10
-rw-r--r--  sys/miscfs/specfs/spec_vnops.c |    8
-rw-r--r--  sys/net/if_sl.c                |   12
-rw-r--r--  sys/nfs/nfs_bio.c              |   38
-rw-r--r--  sys/nfs/nfs_common.c           |    5
-rw-r--r--  sys/nfs/nfs_subs.c             |    5
-rw-r--r--  sys/nfs/nfs_vnops.c            |    5
-rw-r--r--  sys/nfsclient/nfs_bio.c        |   38
-rw-r--r--  sys/nfsclient/nfs_subs.c       |    5
-rw-r--r--  sys/nfsclient/nfs_vnops.c      |    5
-rw-r--r--  sys/nfsserver/nfs_srvsubs.c    |    5
-rw-r--r--  sys/sys/bio.h                  |   36
-rw-r--r--  sys/sys/buf.h                  |   36
-rw-r--r--  sys/sys/malloc.h               |   10
-rw-r--r--  sys/sys/param.h                |    8
-rw-r--r--  sys/sys/types.h                |    3
-rw-r--r--  sys/ufs/mfs/mfs_extern.h       |    5
-rw-r--r--  sys/ufs/mfs/mfs_vfsops.c       |   22
-rw-r--r--  sys/ufs/mfs/mfs_vnops.c        |  120
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c    |   12
-rw-r--r--  sys/ufs/ufs/ufs_vnops.c        |    5
-rw-r--r--  sys/vm/default_pager.c         |   82
-rw-r--r--  sys/vm/device_pager.c          |    4
-rw-r--r--  sys/vm/swap_pager.c            | 2553
-rw-r--r--  sys/vm/swap_pager.h            |   44
-rw-r--r--  sys/vm/vm_fault.c              |  121
-rw-r--r--  sys/vm/vm_glue.c               |   11
-rw-r--r--  sys/vm/vm_kern.c               |   78
-rw-r--r--  sys/vm/vm_map.c                |   51
-rw-r--r--  sys/vm/vm_meter.c              |    7
-rw-r--r--  sys/vm/vm_mmap.c               |    3
-rw-r--r--  sys/vm/vm_object.c             |  246
-rw-r--r--  sys/vm/vm_object.h             |   59
-rw-r--r--  sys/vm/vm_page.c               |  436
-rw-r--r--  sys/vm/vm_page.h               |  132
-rw-r--r--  sys/vm/vm_pageout.c            |  269
-rw-r--r--  sys/vm/vm_pageout.h            |    4
-rw-r--r--  sys/vm/vm_pager.c              |  109
-rw-r--r--  sys/vm/vm_pager.h              |   59
-rw-r--r--  sys/vm/vm_swap.c               |   56
-rw-r--r--  sys/vm/vnode_pager.c           |   19

63 files changed, 3198 insertions, 2062 deletions
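The physical-buffer interface also changes throughout the patch:
getpbuf(), trypbuf() and relpbuf() now take a pointer to a
per-subsystem free count, with NULL selecting the old uncounted pool
(e.g. relpbuf(bp, NULL)).  NFS sizes its own pool as
nfs_pbuf_freecnt = nswbuf / 2 + 1.  A rough sketch of a subsystem
using the counted form, with hypothetical names (example_pbuf_freecnt,
example_pager_io) standing in for real ones:

/*
 * Sketch only: counted pbuf usage, per the prototypes in sys/buf.h
 * below and the NFS pager changes.
 */
static int example_pbuf_freecnt = -1;		/* unlimited until initialized */

void
example_pager_init(void)
{
	example_pbuf_freecnt = nswbuf / 2 + 1;	/* cap this subsystem's pbufs */
}

int
example_pager_io(vm_page_t *pages, int npages)
{
	struct buf *bp;
	vm_offset_t kva;

	bp = getpbuf(&example_pbuf_freecnt);	/* may block if the cap is hit */
	kva = (vm_offset_t)bp->b_data;
	pmap_qenter(kva, pages, npages);
	/* ... build the request, issue the I/O, wait for completion ... */
	pmap_qremove(kva, npages);
	relpbuf(bp, &example_pbuf_freecnt);	/* credit the free count back */
	return (0);
}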
diff --git a/sys/alpha/alpha/pmap.c b/sys/alpha/alpha/pmap.c
index 0e7aa73..fe8741d 100644
--- a/sys/alpha/alpha/pmap.c
+++ b/sys/alpha/alpha/pmap.c
@@ -43,7 +43,7 @@
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
* from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp
* with some ideas from NetBSD's alpha pmap
- * $Id: pmap.c,v 1.11 1998/10/21 11:38:06 dg Exp $
+ * $Id: pmap.c,v 1.12 1998/10/28 13:36:49 dg Exp $
*/
/*
@@ -950,7 +950,7 @@ pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
vm_page_t m;
retry:
m = vm_page_lookup(object, pindex);
- if (m && vm_page_sleep(m, "pplookp", NULL))
+ if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
goto retry;
return m;
}
@@ -1039,7 +1039,7 @@ pmap_dispose_proc(p)
if ((m = vm_page_lookup(upobj, i)) == NULL)
panic("pmap_dispose_proc: upage already missing???");
- vm_page_flag_set(m, PG_BUSY);
+ vm_page_busy(m);
oldpte = *(ptek + i);
*(ptek + i) = 0;
@@ -1128,7 +1128,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
int s;
- while (vm_page_sleep(m, "pmuwpt", NULL));
+ while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
+ ;
if (m->hold_count == 0) {
vm_offset_t pteva;
@@ -1181,7 +1182,7 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
wakeup(m);
}
- vm_page_flag_set(m, PG_BUSY);
+ vm_page_busy(m);
vm_page_free_zero(m);
--cnt.v_wire_count;
}
@@ -1316,10 +1317,10 @@ pmap_release_free_page(pmap_t pmap, vm_page_t p)
* page-table pages. Those pages are zero now, and
* might as well be placed directly into the zero queue.
*/
- if (vm_page_sleep(p, "pmaprl", NULL))
+ if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
return 0;
- vm_page_flag_set(p, PG_BUSY);
+ vm_page_busy(p);
/*
* Remove the page table page from the processes address space.
@@ -2336,7 +2337,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
(p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
- vm_page_flag_set(p, PG_BUSY);
+ vm_page_busy(p);
mpte = pmap_enter_quick(pmap,
addr + alpha_ptob(tmpidx),
VM_PAGE_TO_PHYS(p), mpte);
@@ -2356,7 +2357,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
(p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
- vm_page_flag_set(p, PG_BUSY);
+ vm_page_busy(p);
mpte = pmap_enter_quick(pmap,
addr + alpha_ptob(tmpidx),
VM_PAGE_TO_PHYS(p), mpte);
@@ -2453,7 +2454,7 @@ pmap_prefault(pmap, addra, entry)
if ((m->queue - m->pc) == PQ_CACHE) {
vm_page_deactivate(m);
}
- vm_page_flag_set(m, PG_BUSY);
+ vm_page_busy(m);
mpte = pmap_enter_quick(pmap, addr,
VM_PAGE_TO_PHYS(m), mpte);
vm_page_flag_set(m, PG_MAPPED);
diff --git a/sys/alpha/alpha/symbols.raw b/sys/alpha/alpha/symbols.raw
index bf8881a..2b03da9 100644
--- a/sys/alpha/alpha/symbols.raw
+++ b/sys/alpha/alpha/symbols.raw
@@ -1,6 +1,6 @@
# @(#)symbols.raw 7.6 (Berkeley) 5/8/91
#
-# $Id: symbols.raw,v 1.12 1998/03/30 09:48:20 phk Exp $
+# $Id: symbols.raw,v 1.1 1998/06/10 10:53:25 dfr Exp $
#
@@ -34,7 +34,8 @@
#pstat
# _cons
_nswap
- _swaplist
+ _swapblist
+# _swaplist
#vmstat
_cp_time
# _rate
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 66c9b63..2a378d3 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
- * $Id: pmap.c,v 1.218 1999/01/09 21:41:22 dt Exp $
+ * $Id: pmap.c,v 1.219 1999/01/12 00:17:53 eivind Exp $
*/
/*
@@ -942,7 +942,7 @@ pmap_page_lookup(object, pindex)
vm_page_t m;
retry:
m = vm_page_lookup(object, pindex);
- if (m && vm_page_sleep(m, "pplookp", NULL))
+ if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
goto retry;
return m;
}
@@ -1009,8 +1009,8 @@ pmap_new_proc(p)
}
vm_page_wakeup(m);
- m->flags &= ~PG_ZERO;
- m->flags |= PG_MAPPED | PG_WRITEABLE;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
m->valid = VM_PAGE_BITS_ALL;
}
if (updateneeded)
@@ -1038,7 +1038,7 @@ pmap_dispose_proc(p)
if ((m = vm_page_lookup(upobj, i)) == NULL)
panic("pmap_dispose_proc: upage already missing???");
- m->flags |= PG_BUSY;
+ vm_page_busy(m);
oldpte = *(ptek + i);
*(ptek + i) = 0;
@@ -1107,7 +1107,7 @@ pmap_swapin_proc(p)
vm_page_wire(m);
vm_page_wakeup(m);
- m->flags |= PG_MAPPED | PG_WRITEABLE;
+ vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
}
}
@@ -1122,7 +1122,8 @@ pmap_swapin_proc(p)
static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
- while (vm_page_sleep(m, "pmuwpt", NULL));
+ while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
+ ;
if (m->hold_count == 0) {
vm_offset_t pteva;
@@ -1150,12 +1151,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
--m->wire_count;
if (m->wire_count == 0) {
- if (m->flags & PG_WANTED) {
- m->flags &= ~PG_WANTED;
- wakeup(m);
- }
-
- m->flags |= PG_BUSY;
+ vm_page_flash(m);
+ vm_page_busy(m);
vm_page_free_zero(m);
--cnt.v_wire_count;
}
@@ -1257,7 +1254,8 @@ pmap_pinit(pmap)
ptdpg->wire_count = 1;
++cnt.v_wire_count;
- ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */
+
+ vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
ptdpg->valid = VM_PAGE_BITS_ALL;
pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
@@ -1290,10 +1288,10 @@ pmap_release_free_page(pmap, p)
* page-table pages. Those pages are zero now, and
* might as well be placed directly into the zero queue.
*/
- if (vm_page_sleep(p, "pmaprl", NULL))
+ if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
return 0;
- p->flags |= PG_BUSY;
+ vm_page_busy(p);
/*
* Remove the page table page from the processes address space.
@@ -1393,8 +1391,9 @@ _pmap_allocpte(pmap, ptepindex)
}
m->valid = VM_PAGE_BITS_ALL;
- m->flags &= ~(PG_ZERO | PG_BUSY);
- m->flags |= PG_MAPPED;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_flag_set(m, PG_MAPPED);
+ vm_page_wakeup(m);
return m;
}
@@ -1713,7 +1712,7 @@ pmap_remove_entry(pmap, ppv, va)
TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
ppv->pv_list_count--;
if (TAILQ_FIRST(&ppv->pv_list) == NULL)
- ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+ vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE);
TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
free_pv_entry(pv);
@@ -1791,7 +1790,7 @@ pmap_remove_pte(pmap, ptq, va)
ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
}
if (oldpte & PG_A)
- ppv->pv_vm_page->flags |= PG_REFERENCED;
+ vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED);
return pmap_remove_entry(pmap, ppv, va);
} else {
return pmap_unuse_pt(pmap, va, NULL);
@@ -1976,7 +1975,7 @@ pmap_remove_all(pa)
pv->pv_pmap->pm_stats.wired_count--;
if (tpte & PG_A)
- ppv->pv_vm_page->flags |= PG_REFERENCED;
+ vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED);
/*
* Update the vm_page_t clean and reference bits.
@@ -2005,7 +2004,7 @@ pmap_remove_all(pa)
free_pv_entry(pv);
}
- ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+ vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE);
if (update_needed)
invltlb();
@@ -2081,7 +2080,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
ppv = NULL;
if (pbits & PG_A) {
ppv = pa_to_pvh(pbits);
- ppv->pv_vm_page->flags |= PG_REFERENCED;
+ vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED);
pbits &= ~PG_A;
}
if (pbits & PG_M) {
@@ -2436,7 +2435,7 @@ pmap_object_init_pt(pmap, addr, object, pindex, size, limit)
retry:
p = vm_page_lookup(object, pindex);
- if (p && vm_page_sleep(p, "init4p", NULL))
+ if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
goto retry;
if (p == NULL) {
@@ -2469,7 +2468,7 @@ retry:
ptepa += NBPDR;
ptepindex += 1;
}
- p->flags |= PG_MAPPED;
+ vm_page_flag_set(p, PG_MAPPED);
invltlb();
return;
}
@@ -2510,11 +2509,11 @@ retry:
(p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
- p->flags |= PG_BUSY;
+ vm_page_busy(p);
mpte = pmap_enter_quick(pmap,
addr + i386_ptob(tmpidx),
VM_PAGE_TO_PHYS(p), mpte);
- p->flags |= PG_MAPPED;
+ vm_page_flag_set(p, PG_MAPPED);
vm_page_wakeup(p);
}
objpgs -= 1;
@@ -2531,11 +2530,11 @@ retry:
(p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
- p->flags |= PG_BUSY;
+ vm_page_busy(p);
mpte = pmap_enter_quick(pmap,
addr + i386_ptob(tmpidx),
VM_PAGE_TO_PHYS(p), mpte);
- p->flags |= PG_MAPPED;
+ vm_page_flag_set(p, PG_MAPPED);
vm_page_wakeup(p);
}
}
@@ -2628,10 +2627,10 @@ pmap_prefault(pmap, addra, entry)
if ((m->queue - m->pc) == PQ_CACHE) {
vm_page_deactivate(m);
}
- m->flags |= PG_BUSY;
+ vm_page_busy(m);
mpte = pmap_enter_quick(pmap, addr,
VM_PAGE_TO_PHYS(m), mpte);
- m->flags |= PG_MAPPED;
+ vm_page_flag_set(m, PG_MAPPED);
vm_page_wakeup(m);
}
}
@@ -3026,7 +3025,7 @@ pmap_remove_pages(pmap, sva, eva)
ppv->pv_list_count--;
TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
if (TAILQ_FIRST(&ppv->pv_list) == NULL) {
- ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+ vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE);
}
pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
@@ -3406,7 +3405,7 @@ pmap_mincore(pmap, addr)
*/
else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) {
val |= MINCORE_REFERENCED_OTHER;
- m->flags |= PG_REFERENCED;
+ vm_page_flag_set(m, PG_REFERENCED);
}
}
return val;
diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c
index 57ac533..e4be47f 100644
--- a/sys/cam/cam_periph.c
+++ b/sys/cam/cam_periph.c
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: cam_periph.c,v 1.8 1998/12/16 21:00:06 ken Exp $
+ * $Id: cam_periph.c,v 1.9 1999/01/14 06:21:54 jdp Exp $
*/
#include <sys/param.h>
@@ -599,7 +599,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
/*
* Get the buffer.
*/
- mapinfo->bp[i] = getpbuf();
+ mapinfo->bp[i] = getpbuf(NULL);
/* save the buffer's data address */
mapinfo->bp[i]->b_saveaddr = mapinfo->bp[i]->b_data;
@@ -674,7 +674,7 @@ cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
mapinfo->bp[i]->b_flags &= ~(B_PHYS|B_BUSY);
/* release the buffer */
- relpbuf(mapinfo->bp[i]);
+ relpbuf(mapinfo->bp[i], NULL);
}
/* allow ourselves to be swapped once again */
diff --git a/sys/conf/files b/sys/conf/files
index 795f6f8..02a281b 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -306,6 +306,7 @@ kern/subr_module.c standard
kern/subr_prf.c standard
kern/subr_prof.c standard
kern/subr_rlist.c standard
+kern/subr_blist.c standard
kern/subr_scanf.c standard
kern/subr_xxx.c standard
kern/sys_generic.c standard
diff --git a/sys/conf/options b/sys/conf/options
index 35ceb1a..6dfc0cc 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -1,4 +1,4 @@
-# $Id: options,v 1.120 1999/01/17 19:02:39 peter Exp $
+# $Id: options,v 1.121 1999/01/20 14:49:07 eivind Exp $
#
# On the handling of kernel options
#
@@ -209,6 +209,7 @@ TCPDEBUG
IPFILTER opt_ipfilter.h
IPFILTER_LOG opt_ipfilter.h
IPFILTER_LKM opt_ipfilter.h
+SLIP_IFF_OPTS opt_slip.h
# ATM (HARP version)
ATM_CORE opt_atm.h
diff --git a/sys/fs/procfs/procfs_map.c b/sys/fs/procfs/procfs_map.c
index 4dae10a..c6b8966 100644
--- a/sys/fs/procfs/procfs_map.c
+++ b/sys/fs/procfs/procfs_map.c
@@ -36,7 +36,7 @@
*
* @(#)procfs_status.c 8.3 (Berkeley) 2/17/94
*
- * $Id: procfs_map.c,v 1.17 1998/04/29 04:28:22 dyson Exp $
+ * $Id: procfs_map.c,v 1.18 1998/12/04 22:54:51 archie Exp $
*/
#include <sys/param.h>
@@ -93,7 +93,7 @@ procfs_domap(curp, p, pfs, uio)
((uio->uio_resid > 0) && (entry != &map->header));
entry = entry->next) {
vm_object_t obj, tobj, lobj;
- int ref_count, shadow_count, id, flags;
+ int ref_count, shadow_count, flags;
vm_offset_t addr;
int resident, privateresident;
char *type;
@@ -139,13 +139,11 @@ case OBJT_DEVICE:
flags = obj->flags;
ref_count = obj->ref_count;
shadow_count = obj->shadow_count;
- id = obj->id;
} else {
type = "none";
flags = 0;
ref_count = 0;
shadow_count = 0;
- id = 0;
}
@@ -154,9 +152,9 @@ case OBJT_DEVICE:
* start, end, resident, private resident, cow, access, type.
*/
snprintf(mebuffer, sizeof(mebuffer),
- "0x%x 0x%x %d %d %d %s%s%s %d %d 0x%x %s %s %s\n",
+ "0x%x 0x%x %d %d %p %s%s%s %d %d 0x%x %s %s %s\n",
entry->start, entry->end,
- resident, privateresident, id,
+ resident, privateresident, obj,
(entry->protection & VM_PROT_READ)?"r":"-",
(entry->protection & VM_PROT_WRITE)?"w":"-",
(entry->protection & VM_PROT_EXECUTE)?"x":"-",
diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c
index ff0f347..6096a1b 100644
--- a/sys/fs/specfs/spec_vnops.c
+++ b/sys/fs/specfs/spec_vnops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
- * $Id: spec_vnops.c,v 1.77 1998/12/07 21:58:33 archie Exp $
+ * $Id: spec_vnops.c,v 1.78 1998/12/16 00:10:51 eivind Exp $
*/
#include <sys/param.h>
@@ -781,7 +781,7 @@ spec_getpages(ap)
blksiz = DEV_BSIZE;
size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
- bp = getpbuf();
+ bp = getpbuf(NULL);
kva = (vm_offset_t)bp->b_data;
/*
@@ -894,13 +894,13 @@ spec_getpages(ap)
/*
* Free the buffer header back to the swap buffer pool.
*/
- relpbuf(bp);
+ relpbuf(bp, NULL);
return VM_PAGER_ERROR;
}
/*
* Free the buffer header back to the swap buffer pool.
*/
- relpbuf(bp);
+ relpbuf(bp, NULL);
return VM_PAGER_OK;
}
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 66c9b63..2a378d3 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
- * $Id: pmap.c,v 1.218 1999/01/09 21:41:22 dt Exp $
+ * $Id: pmap.c,v 1.219 1999/01/12 00:17:53 eivind Exp $
*/
/*
@@ -942,7 +942,7 @@ pmap_page_lookup(object, pindex)
vm_page_t m;
retry:
m = vm_page_lookup(object, pindex);
- if (m && vm_page_sleep(m, "pplookp", NULL))
+ if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
goto retry;
return m;
}
@@ -1009,8 +1009,8 @@ pmap_new_proc(p)
}
vm_page_wakeup(m);
- m->flags &= ~PG_ZERO;
- m->flags |= PG_MAPPED | PG_WRITEABLE;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
m->valid = VM_PAGE_BITS_ALL;
}
if (updateneeded)
@@ -1038,7 +1038,7 @@ pmap_dispose_proc(p)
if ((m = vm_page_lookup(upobj, i)) == NULL)
panic("pmap_dispose_proc: upage already missing???");
- m->flags |= PG_BUSY;
+ vm_page_busy(m);
oldpte = *(ptek + i);
*(ptek + i) = 0;
@@ -1107,7 +1107,7 @@ pmap_swapin_proc(p)
vm_page_wire(m);
vm_page_wakeup(m);
- m->flags |= PG_MAPPED | PG_WRITEABLE;
+ vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
}
}
@@ -1122,7 +1122,8 @@ pmap_swapin_proc(p)
static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
- while (vm_page_sleep(m, "pmuwpt", NULL));
+ while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
+ ;
if (m->hold_count == 0) {
vm_offset_t pteva;
@@ -1150,12 +1151,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
--m->wire_count;
if (m->wire_count == 0) {
- if (m->flags & PG_WANTED) {
- m->flags &= ~PG_WANTED;
- wakeup(m);
- }
-
- m->flags |= PG_BUSY;
+ vm_page_flash(m);
+ vm_page_busy(m);
vm_page_free_zero(m);
--cnt.v_wire_count;
}
@@ -1257,7 +1254,8 @@ pmap_pinit(pmap)
ptdpg->wire_count = 1;
++cnt.v_wire_count;
- ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */
+
+ vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
ptdpg->valid = VM_PAGE_BITS_ALL;
pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
@@ -1290,10 +1288,10 @@ pmap_release_free_page(pmap, p)
* page-table pages. Those pages are zero now, and
* might as well be placed directly into the zero queue.
*/
- if (vm_page_sleep(p, "pmaprl", NULL))
+ if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
return 0;
- p->flags |= PG_BUSY;
+ vm_page_busy(p);
/*
* Remove the page table page from the processes address space.
@@ -1393,8 +1391,9 @@ _pmap_allocpte(pmap, ptepindex)
}
m->valid = VM_PAGE_BITS_ALL;
- m->flags &= ~(PG_ZERO | PG_BUSY);
- m->flags |= PG_MAPPED;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_flag_set(m, PG_MAPPED);
+ vm_page_wakeup(m);
return m;
}
@@ -1713,7 +1712,7 @@ pmap_remove_entry(pmap, ppv, va)
TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
ppv->pv_list_count--;
if (TAILQ_FIRST(&ppv->pv_list) == NULL)
- ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+ vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE);
TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
free_pv_entry(pv);
@@ -1791,7 +1790,7 @@ pmap_remove_pte(pmap, ptq, va)
ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
}
if (oldpte & PG_A)
- ppv->pv_vm_page->flags |= PG_REFERENCED;
+ vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED);
return pmap_remove_entry(pmap, ppv, va);
} else {
return pmap_unuse_pt(pmap, va, NULL);
@@ -1976,7 +1975,7 @@ pmap_remove_all(pa)
pv->pv_pmap->pm_stats.wired_count--;
if (tpte & PG_A)
- ppv->pv_vm_page->flags |= PG_REFERENCED;
+ vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED);
/*
* Update the vm_page_t clean and reference bits.
@@ -2005,7 +2004,7 @@ pmap_remove_all(pa)
free_pv_entry(pv);
}
- ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+ vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE);
if (update_needed)
invltlb();
@@ -2081,7 +2080,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
ppv = NULL;
if (pbits & PG_A) {
ppv = pa_to_pvh(pbits);
- ppv->pv_vm_page->flags |= PG_REFERENCED;
+ vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED);
pbits &= ~PG_A;
}
if (pbits & PG_M) {
@@ -2436,7 +2435,7 @@ pmap_object_init_pt(pmap, addr, object, pindex, size, limit)
retry:
p = vm_page_lookup(object, pindex);
- if (p && vm_page_sleep(p, "init4p", NULL))
+ if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
goto retry;
if (p == NULL) {
@@ -2469,7 +2468,7 @@ retry:
ptepa += NBPDR;
ptepindex += 1;
}
- p->flags |= PG_MAPPED;
+ vm_page_flag_set(p, PG_MAPPED);
invltlb();
return;
}
@@ -2510,11 +2509,11 @@ retry:
(p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
- p->flags |= PG_BUSY;
+ vm_page_busy(p);
mpte = pmap_enter_quick(pmap,
addr + i386_ptob(tmpidx),
VM_PAGE_TO_PHYS(p), mpte);
- p->flags |= PG_MAPPED;
+ vm_page_flag_set(p, PG_MAPPED);
vm_page_wakeup(p);
}
objpgs -= 1;
@@ -2531,11 +2530,11 @@ retry:
(p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
- p->flags |= PG_BUSY;
+ vm_page_busy(p);
mpte = pmap_enter_quick(pmap,
addr + i386_ptob(tmpidx),
VM_PAGE_TO_PHYS(p), mpte);
- p->flags |= PG_MAPPED;
+ vm_page_flag_set(p, PG_MAPPED);
vm_page_wakeup(p);
}
}
@@ -2628,10 +2627,10 @@ pmap_prefault(pmap, addra, entry)
if ((m->queue - m->pc) == PQ_CACHE) {
vm_page_deactivate(m);
}
- m->flags |= PG_BUSY;
+ vm_page_busy(m);
mpte = pmap_enter_quick(pmap, addr,
VM_PAGE_TO_PHYS(m), mpte);
- m->flags |= PG_MAPPED;
+ vm_page_flag_set(m, PG_MAPPED);
vm_page_wakeup(m);
}
}
@@ -3026,7 +3025,7 @@ pmap_remove_pages(pmap, sva, eva)
ppv->pv_list_count--;
TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
if (TAILQ_FIRST(&ppv->pv_list) == NULL) {
- ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+ vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE);
}
pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
@@ -3406,7 +3405,7 @@ pmap_mincore(pmap, addr)
*/
else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) {
val |= MINCORE_REFERENCED_OTHER;
- m->flags |= PG_REFERENCED;
+ vm_page_flag_set(m, PG_REFERENCED);
}
}
return val;
diff --git a/sys/i386/i386/symbols.raw b/sys/i386/i386/symbols.raw
index 4703c30..943d8ae 100644
--- a/sys/i386/i386/symbols.raw
+++ b/sys/i386/i386/symbols.raw
@@ -1,6 +1,6 @@
# @(#)symbols.raw 7.6 (Berkeley) 5/8/91
#
-# $Id: symbols.raw,v 1.12 1998/03/30 09:48:20 phk Exp $
+# $Id: symbols.raw,v 1.13 1998/09/15 10:03:43 gibbs Exp $
#
@@ -28,7 +28,8 @@
#pstat
# _cons
_nswap
- _swaplist
+ _swapblist
+# _swaplist
#vmstat
_cp_time
# _rate
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index a9776a5..be9f9d3 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
- * $Id: kern_malloc.c,v 1.50 1999/01/08 17:31:09 eivind Exp $
+ * $Id: kern_malloc.c,v 1.51 1999/01/10 01:58:24 eivind Exp $
*/
#include "opt_vm.h"
@@ -101,7 +101,16 @@ struct freelist {
#endif /* INVARIANTS */
/*
- * Allocate a block of memory
+ * malloc:
+ *
+ * Allocate a block of memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ *
+ * If M_ASLEEP is set (M_NOWAIT must also be set), this routine
+ * will have the side effect of calling asleep() if it returns NULL,
+ * allowing the parent to await() at some future time.
*/
void *
malloc(size, type, flags)
@@ -122,13 +131,26 @@ malloc(size, type, flags)
#endif
register struct malloc_type *ksp = type;
- if (!type->ks_next)
+ /*
+ * Must be at splmem() prior to initializing segment to handle
+ * potential initialization race.
+ */
+
+ s = splmem();
+
+ if (!type->ks_next) {
malloc_init(type);
+ }
indx = BUCKETINDX(size);
kbp = &bucket[indx];
- s = splmem();
+
while (ksp->ks_memuse >= ksp->ks_limit) {
+ if (flags & M_ASLEEP) {
+ if (ksp->ks_limblocks < 65535)
+ ksp->ks_limblocks++;
+ asleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0);
+ }
if (flags & M_NOWAIT) {
splx(s);
return ((void *) NULL);
@@ -239,7 +261,11 @@ out:
}
/*
- * Free a block of memory allocated by malloc.
+ * free:
+ *
+ * Free a block of memory allocated by malloc.
+ *
+ * This routine may not block.
*/
void
free(addr, type)
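The comment block added above spells out the M_NOWAIT/M_ASLEEP
contract: a failed allocation can register the caller via asleep() so
it can await() and retry later.  A hedged sketch of a caller using
that contract (try_alloc() and the use of M_TEMP are illustrative;
asleep()/await() are assumed to follow their usual signatures from
this era):

/*
 * Sketch only: non-blocking allocation with an async retry.
 */
static void *
try_alloc(unsigned long size)
{
	void *p;

	p = malloc(size, M_TEMP, M_NOWAIT | M_ASLEEP);
	if (p == NULL) {
		/*
		 * malloc() failed but queued us via asleep(); block here
		 * until the shortage is signalled, then try once more.
		 */
		await(PSWP + 2, 0);
		p = malloc(size, M_TEMP, M_NOWAIT);
	}
	return (p);
}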
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
index 441d95f..ad63a98 100644
--- a/sys/kern/kern_physio.c
+++ b/sys/kern/kern_physio.c
@@ -16,7 +16,7 @@
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*
- * $Id: kern_physio.c,v 1.28 1998/08/19 10:50:32 sos Exp $
+ * $Id: kern_physio.c,v 1.29 1998/10/25 17:44:51 phk Exp $
*/
#include <sys/param.h>
@@ -147,7 +147,7 @@ physio(strategy, bp, dev, rw, minp, uio)
doerror:
- relpbuf(bpa);
+ relpbuf(bpa, NULL);
if (!bp_alloc) {
bp->b_flags &= ~(B_BUSY|B_PHYS);
if( bp->b_flags & B_WANTED) {
@@ -197,13 +197,13 @@ phygetvpbuf(dev_t dev, int resid)
bdsw = cdevsw[major(dev)];
if ((bdsw == NULL) || (bdsw->d_bmaj == -1))
- return getpbuf();
+ return getpbuf(NULL);
maxio = bdsw->d_maxio;
if (resid > maxio)
resid = maxio;
- return getpbuf();
+ return getpbuf(NULL);
}
static void
diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c
index d637ab4..810b87e 100644
--- a/sys/kern/subr_rlist.c
+++ b/sys/kern/subr_rlist.c
@@ -13,7 +13,7 @@
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This software is a component of "386BSD" developed by
- William F. Jolitz, TeleMuse.
+ * William F. Jolitz, TeleMuse.
* 4. Neither the name of the developer nor the name "386BSD"
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
@@ -54,9 +54,13 @@
* functioning of this software, nor does the author assume any responsibility
* for damages incurred with its use.
*
- * $Id: subr_rlist.c,v 1.28 1999/01/08 17:31:12 eivind Exp $
+ * --------- DEPRECIATED ---------
+ *
+ * $Id: subr_rlist.c,v 1.29 1999/01/10 01:58:25 eivind Exp $
*/
+#if 0
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/rlist.h>
@@ -307,3 +311,6 @@ rlist_destroy (rlh)
rlist_mfree(lp);
}
}
+
+#endif
+
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index edc74a7..a6c2dfe 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -1,4 +1,4 @@
-/* $Id: sysv_shm.c,v 1.38 1998/08/24 08:39:38 dfr Exp $ */
+/* $Id: sysv_shm.c,v 1.39 1998/10/13 08:24:40 dg Exp $ */
/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
/*
@@ -52,6 +52,7 @@
#include <vm/pmap.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
+#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_inherit.h>
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 6cc487a..1634681 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
- * $Id: uipc_syscalls.c,v 1.48 1998/12/03 12:35:47 dg Exp $
+ * $Id: uipc_syscalls.c,v 1.49 1998/12/07 21:58:29 archie Exp $
*/
#include "opt_compat.h"
@@ -1543,7 +1543,13 @@ retry_lookup:
VM_WAIT;
goto retry_lookup;
}
- vm_page_flag_clear(pg, PG_BUSY);
+ /*
+ * don't just clear PG_BUSY manually -
+ * vm_page_alloc() should be considered opaque,
+ * use the VM routine provided to clear
+ * PG_BUSY.
+ */
+ vm_page_wakeup(pg);
}
/*
* Ensure that our page is still around when the I/O completes.
@@ -1583,21 +1589,12 @@ retry_lookup:
goto done;
}
} else {
- if ((pg->flags & PG_BUSY) || pg->busy) {
- s = splvm();
- if ((pg->flags & PG_BUSY) || pg->busy) {
- /*
- * Page is busy. Wait and retry.
- */
- vm_page_flag_set(pg, PG_WANTED);
- tsleep(pg, PVM, "sfpbsy", 0);
- splx(s);
- goto retry_lookup;
- }
- splx(s);
- }
+ if (vm_page_sleep_busy(pg, TRUE, "sfpbsy"))
+ goto retry_lookup;
+
/*
- * Protect from having the page ripped out from beneath us.
+ * Protect from having the page ripped out from
+ * beneath us.
*/
vm_page_wire(pg);
}
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 93f6164..d528f5e 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
- * $Id: uipc_usrreq.c,v 1.36 1998/07/15 02:32:12 bde Exp $
+ * $Id: uipc_usrreq.c,v 1.37 1998/10/25 17:44:51 phk Exp $
*/
#include <sys/param.h>
@@ -1114,8 +1114,11 @@ unp_gc()
/*
* for each FD on our hit list, do the following two things
*/
- for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
- sorflush((struct socket *)(*fpp)->f_data);
+ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
+ struct file *tfp = *fpp;
+ if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL)
+ sorflush((struct socket *)(tfp->f_data));
+ }
for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
closef(*fpp, (struct proc *) NULL);
free((caddr_t)extra_ref, M_FILE);
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index c7c8aa9..c1af873 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -13,7 +13,7 @@
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*
- * $Id: vfs_aio.c,v 1.35 1998/11/27 01:14:21 tegge Exp $
+ * $Id: vfs_aio.c,v 1.36 1998/12/15 17:38:33 des Exp $
*/
/*
@@ -386,7 +386,7 @@ aio_free_entry(struct aiocblist *aiocbe)
splx(s);
if (aiocbe->bp) {
vunmapbuf(aiocbe->bp);
- relpbuf(aiocbe->bp);
+ relpbuf(aiocbe->bp, NULL);
aiocbe->bp = NULL;
}
}
@@ -1035,7 +1035,7 @@ aio_qphysio(p, aiocbe)
}
/* create and build a buffer header for a transfer */
- bp = (struct buf *)getpbuf();
+ bp = (struct buf *)getpbuf(NULL);
/*
* get a copy of the kva from the physical buffer
@@ -1122,7 +1122,7 @@ doerror:
lj->lioj_buffer_count--;
}
aiocbe->bp = NULL;
- relpbuf(bp);
+ relpbuf(bp, NULL);
return error;
}
@@ -1172,7 +1172,7 @@ aio_fphysio(p, iocb, flgwait)
error = bp->b_error;
}
- relpbuf(bp);
+ relpbuf(bp, NULL);
return (error);
}
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 30018b5..3bb204e 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.192 1999/01/12 11:59:34 eivind Exp $
+ * $Id: vfs_bio.c,v 1.193 1999/01/19 08:00:51 dillon Exp $
*/
/*
@@ -562,7 +562,7 @@ brelse(struct buf * bp)
int s;
if (bp->b_flags & B_CLUSTER) {
- relpbuf(bp);
+ relpbuf(bp, NULL);
return;
}
@@ -1364,6 +1364,7 @@ vfs_setdirty(struct buf *bp) {
break;
}
}
+
boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
if (boffset < bp->b_dirtyoff) {
bp->b_dirtyoff = max(boffset, 0);
@@ -1412,7 +1413,6 @@ loop:
if ((bp = gbincore(vp, blkno))) {
if (bp->b_flags & B_BUSY) {
-
bp->b_flags |= B_WANTED;
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
@@ -1429,16 +1429,13 @@ loop:
bremfree(bp);
/*
- * check for size inconsistancies (note that they shouldn't
- * happen but do when filesystems don't handle the size changes
- * correctly.) We are conservative on metadata and don't just
- * extend the buffer but write (if needed) and re-constitute it.
+ * check for size inconsistancies for non-VMIO case.
*/
if (bp->b_bcount != size) {
- if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
- allocbuf(bp, size);
- } else {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (size > bp->b_kvasize)
+ ) {
if (bp->b_flags & B_DELWRI) {
bp->b_flags |= B_NOCACHE;
VOP_BWRITE(bp);
@@ -1455,15 +1452,26 @@ loop:
goto loop;
}
}
+
+ /*
+ * If the size is inconsistant in the VMIO case, we can resize
+ * the buffer. This might lead to B_CACHE getting cleared.
+ */
+
+ if (bp->b_bcount != size)
+ allocbuf(bp, size);
+
KASSERT(bp->b_offset != NOOFFSET,
("getblk: no buffer offset"));
+
/*
* Check that the constituted buffer really deserves for the
* B_CACHE bit to be set. B_VMIO type buffers might not
* contain fully valid pages. Normal (old-style) buffers
- * should be fully valid.
+ * should be fully valid. This might also lead to B_CACHE
+ * getting clear.
*/
- if (bp->b_flags & B_VMIO) {
+ if ((bp->b_flags & B_VMIO|B_CACHE) == (B_VMIO|B_CACHE)) {
int checksize = bp->b_bufsize;
int poffset = bp->b_offset & PAGE_MASK;
int resid;
@@ -1479,6 +1487,19 @@ loop:
}
}
+ /*
+ * If B_DELWRI is set and B_CACHE got cleared ( or was
+ * already clear ), we have to commit the write and
+ * retry. The NFS code absolutely depends on this,
+ * and so might the FFS code. In anycase, it formalizes
+ * the B_CACHE rules. See sys/buf.h.
+ */
+
+ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ VOP_BWRITE(bp);
+ goto loop;
+ }
+
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
splx(s);
@@ -1572,19 +1593,18 @@ geteblk(int size)
/*
* This code constitutes the buffer memory from either anonymous system
* memory (in the case of non-VMIO operations) or from an associated
- * VM object (in the case of VMIO operations).
+ * VM object (in the case of VMIO operations). This code is able to
+ * resize a buffer up or down.
*
* Note that this code is tricky, and has many complications to resolve
- * deadlock or inconsistant data situations. Tread lightly!!!
- *
- * Modify the length of a buffer's underlying buffer storage without
- * destroying information (unless, of course the buffer is shrinking).
+ * deadlock or inconsistant data situations. Tread lightly!!!
+ * There are B_CACHE and B_DELWRI interactions that must be dealt with by
+ * the caller. Calling this code willy nilly can result in the loss of data.
*/
+
int
-allocbuf(struct buf * bp, int size)
+allocbuf(struct buf *bp, int size)
{
-
- int s;
int newbsize, mbsize;
int i;
@@ -1705,7 +1725,8 @@ allocbuf(struct buf * bp, int size)
m = bp->b_pages[i];
KASSERT(m != bogus_page,
("allocbuf: bogus page found"));
- vm_page_sleep(m, "biodep", &m->busy);
+ while (vm_page_sleep_busy(m, TRUE, "biodep"))
+ ;
bp->b_pages[i] = NULL;
vm_page_unwire(m, 0);
@@ -1771,16 +1792,25 @@ allocbuf(struct buf * bp, int size)
}
vm_page_wire(m);
- vm_page_flag_clear(m, PG_BUSY);
+ vm_page_wakeup(m);
bp->b_flags &= ~B_CACHE;
- } else if (m->flags & PG_BUSY) {
- s = splvm();
- if (m->flags & PG_BUSY) {
- vm_page_flag_set(m, PG_WANTED);
- tsleep(m, PVM, "pgtblk", 0);
- }
- splx(s);
+ } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) {
+ /*
+ * If we had to sleep, retry.
+ *
+ * Also note that we only test
+ * PG_BUSY here, not m->busy.
+ *
+ * We cannot sleep on m->busy
+ * here because a vm_fault ->
+ * getpages -> cluster-read ->
+ * ...-> allocbuf sequence
+ * will convert PG_BUSY to
+ * m->busy so we have to let
+ * m->busy through if we do
+ * not want to deadlock.
+ */
goto doretry;
} else {
if ((curproc != pageproc) &&
@@ -2010,12 +2040,8 @@ biodone(register struct buf * bp)
foff += resid;
iosize -= resid;
}
- if (obj &&
- (obj->paging_in_progress == 0) &&
- (obj->flags & OBJ_PIPWNT)) {
- vm_object_clear_flag(obj, OBJ_PIPWNT);
- wakeup(obj);
- }
+ if (obj)
+ vm_object_pip_wakeupn(obj, 0);
}
/*
* For asynchronous completions, release the buffer now. The brelse
@@ -2096,11 +2122,7 @@ vfs_unbusy_pages(struct buf * bp)
vm_page_flag_clear(m, PG_ZERO);
vm_page_io_finish(m);
}
- if (obj->paging_in_progress == 0 &&
- (obj->flags & OBJ_PIPWNT)) {
- vm_object_clear_flag(obj, OBJ_PIPWNT);
- wakeup(obj);
- }
+ vm_object_pip_wakeupn(obj, 0);
}
}
@@ -2109,6 +2131,8 @@ vfs_unbusy_pages(struct buf * bp)
* of a page. If the consumer is not NFS, and the page is not
* valid for the entire range, clear the B_CACHE flag to force
* the consumer to re-read the page.
+ *
+ * B_CACHE interaction is especially tricky.
*/
static void
vfs_buf_set_valid(struct buf *bp,
@@ -2135,13 +2159,16 @@ vfs_buf_set_valid(struct buf *bp,
}
evalid = min(evalid, off + size);
/*
- * Make sure this range is contiguous with the range
- * built up from previous pages. If not, then we will
- * just use the range from the previous pages.
+ * We can only set b_validoff/end if this range is contiguous
+ * with the range built up already. If we cannot set
+ * b_validoff/end, we must clear B_CACHE to force an update
+ * to clean the bp up.
*/
if (svalid == bp->b_validend) {
bp->b_validoff = min(bp->b_validoff, svalid);
bp->b_validend = max(bp->b_validend, evalid);
+ } else {
+ bp->b_flags &= ~B_CACHE;
}
} else if (!vm_page_is_valid(m,
(vm_offset_t) ((foff + off) & PAGE_MASK),
@@ -2154,6 +2181,10 @@ vfs_buf_set_valid(struct buf *bp,
* Set the valid bits in a page, taking care of the b_validoff,
* b_validend fields which NFS uses to optimise small reads. Off is
* the offset within the file and pageno is the page index within the buf.
+ *
+ * XXX we have to set the valid & clean bits for all page fragments
+ * touched by b_validoff/validend, even if the page fragment goes somewhat
+ * beyond b_validoff/validend due to alignment.
*/
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
@@ -2208,7 +2239,7 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
retry:
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
- if (vm_page_sleep(m, "vbpage", NULL))
+ if (vm_page_sleep_busy(m, FALSE, "vbpage"))
goto retry;
}
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index ce842ad..781508e 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.76 1999/01/08 17:31:15 eivind Exp $
+ * $Id: vfs_cluster.c,v 1.77 1999/01/10 01:58:25 eivind Exp $
*/
#include "opt_debug_cluster.h"
@@ -68,6 +68,8 @@ static struct buf *
extern vm_page_t bogus_page;
+extern int cluster_pbuf_freecnt;
+
/*
* Maximum number of blocks for read-ahead.
*/
@@ -336,7 +338,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
return tbp;
- bp = trypbuf();
+ bp = trypbuf(&cluster_pbuf_freecnt);
if (bp == 0)
return tbp;
@@ -475,7 +477,7 @@ cluster_callback(bp)
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
biodone(tbp);
}
- relpbuf(bp);
+ relpbuf(bp, &cluster_pbuf_freecnt);
}
/*
@@ -654,7 +656,7 @@ cluster_wbuild(vp, size, start_lbn, len)
(tbp->b_bcount != tbp->b_bufsize) ||
(tbp->b_bcount != size) ||
(len == 1) ||
- ((bp = trypbuf()) == NULL)) {
+ ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) {
totalwritten += tbp->b_bufsize;
bawrite(tbp);
++start_lbn;
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 179ef78..44b1698 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.181 1999/01/08 17:31:17 eivind Exp $
+ * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $
*/
/*
@@ -63,10 +63,13 @@
#include <machine/limits.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>
@@ -985,6 +988,10 @@ sched_sync(void)
/*
* Associate a p-buffer with a vnode.
+ *
+ * Also sets B_PAGING flag to indicate that vnode is not fully associated
+ * with the buffer. i.e. the bp has not been linked into the vnode or
+ * ref-counted.
*/
void
pbgetvp(vp, bp)
@@ -995,6 +1002,7 @@ pbgetvp(vp, bp)
KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
bp->b_vp = vp;
+ bp->b_flags |= B_PAGING;
if (vp->v_type == VBLK || vp->v_type == VCHR)
bp->b_dev = vp->v_rdev;
else
@@ -1011,7 +1019,34 @@ pbrelvp(bp)
KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+#if !defined(MAX_PERF)
+ /* XXX REMOVE ME */
+ if (bp->b_vnbufs.tqe_next != NULL) {
+ panic(
+ "relpbuf(): b_vp was probably reassignbuf()d %p %x",
+ bp,
+ (int)bp->b_flags
+ );
+ }
+#endif
bp->b_vp = (struct vnode *) 0;
+ bp->b_flags &= ~B_PAGING;
+}
+
+void
+pbreassignbuf(bp, newvp)
+ struct buf *bp;
+ struct vnode *newvp;
+{
+#if !defined(MAX_PERF)
+ if ((bp->b_flags & B_PAGING) == 0) {
+ panic(
+ "pbreassignbuf() on non phys bp %p",
+ bp
+ );
+ }
+#endif
+ bp->b_vp = newvp;
}
/*
@@ -1034,6 +1069,15 @@ reassignbuf(bp, newvp)
return;
}
+#if !defined(MAX_PERF)
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+#endif
+
s = splbio();
/*
* Delete from old vnode list, if on one.
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 179ef78..44b1698 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.181 1999/01/08 17:31:17 eivind Exp $
+ * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $
*/
/*
@@ -63,10 +63,13 @@
#include <machine/limits.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>
@@ -985,6 +988,10 @@ sched_sync(void)
/*
* Associate a p-buffer with a vnode.
+ *
+ * Also sets B_PAGING flag to indicate that vnode is not fully associated
+ * with the buffer. i.e. the bp has not been linked into the vnode or
+ * ref-counted.
*/
void
pbgetvp(vp, bp)
@@ -995,6 +1002,7 @@ pbgetvp(vp, bp)
KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
bp->b_vp = vp;
+ bp->b_flags |= B_PAGING;
if (vp->v_type == VBLK || vp->v_type == VCHR)
bp->b_dev = vp->v_rdev;
else
@@ -1011,7 +1019,34 @@ pbrelvp(bp)
KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+#if !defined(MAX_PERF)
+ /* XXX REMOVE ME */
+ if (bp->b_vnbufs.tqe_next != NULL) {
+ panic(
+ "relpbuf(): b_vp was probably reassignbuf()d %p %x",
+ bp,
+ (int)bp->b_flags
+ );
+ }
+#endif
bp->b_vp = (struct vnode *) 0;
+ bp->b_flags &= ~B_PAGING;
+}
+
+void
+pbreassignbuf(bp, newvp)
+ struct buf *bp;
+ struct vnode *newvp;
+{
+#if !defined(MAX_PERF)
+ if ((bp->b_flags & B_PAGING) == 0) {
+ panic(
+ "pbreassignbuf() on non phys bp %p",
+ bp
+ );
+ }
+#endif
+ bp->b_vp = newvp;
}
/*
@@ -1034,6 +1069,15 @@ reassignbuf(bp, newvp)
return;
}
+#if !defined(MAX_PERF)
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+#endif
+
s = splbio();
/*
* Delete from old vnode list, if on one.
diff --git a/sys/miscfs/devfs/devfs_vnops.c b/sys/miscfs/devfs/devfs_vnops.c
index e9bdc2a..56fa842 100644
--- a/sys/miscfs/devfs/devfs_vnops.c
+++ b/sys/miscfs/devfs/devfs_vnops.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: devfs_vnops.c,v 1.64 1998/12/15 23:46:59 eivind Exp $
+ * $Id: devfs_vnops.c,v 1.65 1999/01/12 11:49:29 eivind Exp $
*/
@@ -1933,7 +1933,7 @@ devfs_getpages(struct vop_getpages_args *ap)
blksiz = DEV_BSIZE;
size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
- bp = getpbuf();
+ bp = getpbuf(NULL);
kva = (vm_offset_t)bp->b_data;
/*
@@ -2042,13 +2042,13 @@ devfs_getpages(struct vop_getpages_args *ap)
/*
* Free the buffer header back to the swap buffer pool.
*/
- relpbuf(bp);
+ relpbuf(bp, NULL);
return VM_PAGER_ERROR;
}
/*
* Free the buffer header back to the swap buffer pool.
*/
- relpbuf(bp);
+ relpbuf(bp, NULL);
return VM_PAGER_OK;
}
diff --git a/sys/miscfs/procfs/procfs_map.c b/sys/miscfs/procfs/procfs_map.c
index 4dae10a..c6b8966 100644
--- a/sys/miscfs/procfs/procfs_map.c
+++ b/sys/miscfs/procfs/procfs_map.c
@@ -36,7 +36,7 @@
*
* @(#)procfs_status.c 8.3 (Berkeley) 2/17/94
*
- * $Id: procfs_map.c,v 1.17 1998/04/29 04:28:22 dyson Exp $
+ * $Id: procfs_map.c,v 1.18 1998/12/04 22:54:51 archie Exp $
*/
#include <sys/param.h>
@@ -93,7 +93,7 @@ procfs_domap(curp, p, pfs, uio)
((uio->uio_resid > 0) && (entry != &map->header));
entry = entry->next) {
vm_object_t obj, tobj, lobj;
- int ref_count, shadow_count, id, flags;
+ int ref_count, shadow_count, flags;
vm_offset_t addr;
int resident, privateresident;
char *type;
@@ -139,13 +139,11 @@ case OBJT_DEVICE:
flags = obj->flags;
ref_count = obj->ref_count;
shadow_count = obj->shadow_count;
- id = obj->id;
} else {
type = "none";
flags = 0;
ref_count = 0;
shadow_count = 0;
- id = 0;
}
@@ -154,9 +152,9 @@ case OBJT_DEVICE:
* start, end, resident, private resident, cow, access, type.
*/
snprintf(mebuffer, sizeof(mebuffer),
- "0x%x 0x%x %d %d %d %s%s%s %d %d 0x%x %s %s %s\n",
+ "0x%x 0x%x %d %d %p %s%s%s %d %d 0x%x %s %s %s\n",
entry->start, entry->end,
- resident, privateresident, id,
+ resident, privateresident, obj,
(entry->protection & VM_PROT_READ)?"r":"-",
(entry->protection & VM_PROT_WRITE)?"w":"-",
(entry->protection & VM_PROT_EXECUTE)?"x":"-",
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index ff0f347..6096a1b 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
- * $Id: spec_vnops.c,v 1.77 1998/12/07 21:58:33 archie Exp $
+ * $Id: spec_vnops.c,v 1.78 1998/12/16 00:10:51 eivind Exp $
*/
#include <sys/param.h>
@@ -781,7 +781,7 @@ spec_getpages(ap)
blksiz = DEV_BSIZE;
size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
- bp = getpbuf();
+ bp = getpbuf(NULL);
kva = (vm_offset_t)bp->b_data;
/*
@@ -894,13 +894,13 @@ spec_getpages(ap)
/*
* Free the buffer header back to the swap buffer pool.
*/
- relpbuf(bp);
+ relpbuf(bp, NULL);
return VM_PAGER_ERROR;
}
/*
* Free the buffer header back to the swap buffer pool.
*/
- relpbuf(bp);
+ relpbuf(bp, NULL);
return VM_PAGER_OK;
}
diff --git a/sys/net/if_sl.c b/sys/net/if_sl.c
index 99a6978..151df6e 100644
--- a/sys/net/if_sl.c
+++ b/sys/net/if_sl.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)if_sl.c 8.6 (Berkeley) 2/1/94
- * $Id: if_sl.c,v 1.69 1998/06/07 17:12:05 dfr Exp $
+ * $Id: if_sl.c,v 1.70 1998/07/15 02:32:23 bde Exp $
*/
/*
@@ -70,7 +70,9 @@
#include "bpfilter.h"
#include "opt_inet.h"
-
+#if !defined(ACTUALLY_LKM_NOT_KERNEL) && !defined(KLD_MODULE)
+#include "opt_slip.h"
+#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
@@ -217,7 +219,11 @@ slattach(dummy)
sc->sc_if.if_unit = i++;
sc->sc_if.if_mtu = SLMTU;
sc->sc_if.if_flags =
- IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST;
+#ifdef SLIP_IFF_OPTS
+ SLIP_IFF_OPTS;
+#else
+ IFF_BROADCAST | IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST;
+#endif
sc->sc_if.if_type = IFT_SLIP;
sc->sc_if.if_ioctl = slioctl;
sc->sc_if.if_output = sloutput;
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index c973700..fb437a5 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.64 1998/12/07 21:58:43 archie Exp $
+ * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $
*/
@@ -68,6 +68,7 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
+extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;
/*
@@ -113,7 +114,7 @@ nfs_getpages(ap)
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
- bp = getpbuf();
+ bp = getpbuf(&nfs_pbuf_freecnt);
npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
@@ -132,10 +133,16 @@ nfs_getpages(ap)
error = nfs_readrpc(vp, &uio, cred);
pmap_qremove(kva, npages);
- relpbuf(bp);
+ relpbuf(bp, &nfs_pbuf_freecnt);
- if (error && (uio.uio_resid == count))
+ if (error && (uio.uio_resid == count)) {
+ printf("nfs_getpages: error %d\n", error);
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
return VM_PAGER_ERROR;
+ }
size = count - uio.uio_resid;
@@ -228,7 +235,7 @@ nfs_putpages(ap)
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
- bp = getpbuf();
+ bp = getpbuf(&nfs_pbuf_freecnt);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@@ -251,7 +258,7 @@ nfs_putpages(ap)
error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
pmap_qremove(kva, npages);
- relpbuf(bp);
+ relpbuf(bp, &nfs_pbuf_freecnt);
if (!error) {
int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
@@ -439,6 +446,7 @@ again:
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
+
/*
* If we are being called from nfs_getpages, we must
* make sure the buffer is a vmio buffer. The vp will
@@ -779,6 +787,7 @@ again:
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
*/
+
if (bp->b_dirtyend > 0 &&
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
bp->b_proc = p;
@@ -1254,17 +1263,24 @@ nfs_doio(bp, cr, p)
* write rpc with iomode == NFSV3WRITE_FILESYNC before
* the block is reused. This is indicated by setting
* the B_DELWRI and B_NEEDCOMMIT flags.
+ *
+ * If the buffer is marked B_PAGING, it does not reside on
+ * the vp's paging queues so we do not ( and cannot ) reassign
+ * it. XXX numdirtybuffers should be integrated into
+ * reassignbuf() call.
*/
if (error == EINTR
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
int s;
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
- ++numdirtybuffers;
- bp->b_flags |= B_DELWRI;
- s = splbio();
- reassignbuf(bp, vp);
- splx(s);
+ if ((bp->b_flags & B_PAGING) == 0) {
+ ++numdirtybuffers;
+ bp->b_flags |= B_DELWRI;
+ s = splbio();
+ reassignbuf(bp, vp);
+ splx(s);
+ }
if ((bp->b_flags & B_ASYNC) == 0)
bp->b_flags |= B_EINTR;
} else {
diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c
index b3eec24..6c9cfb7 100644
--- a/sys/nfs/nfs_common.c
+++ b/sys/nfs/nfs_common.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95
- * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $
+ * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $
*/
/*
@@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= {
};
int nfs_ticks;
+int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
struct nfssvc_sockhead nfssvc_sockhead;
@@ -1191,6 +1192,8 @@ nfs_init(vfsp)
sysent[SYS_getfh].sy_call = (sy_call_t *)getfh;
#endif
+ nfs_pbuf_freecnt = nswbuf / 2 + 1;
+
return (0);
}
diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c
index b3eec24..6c9cfb7 100644
--- a/sys/nfs/nfs_subs.c
+++ b/sys/nfs/nfs_subs.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95
- * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $
+ * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $
*/
/*
@@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= {
};
int nfs_ticks;
+int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
struct nfssvc_sockhead nfssvc_sockhead;
@@ -1191,6 +1192,8 @@ nfs_init(vfsp)
sysent[SYS_getfh].sy_call = (sy_call_t *)getfh;
#endif
+ nfs_pbuf_freecnt = nswbuf / 2 + 1;
+
return (0);
}
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index c97267a..4131b60 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.115 1998/12/25 10:34:27 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.116 1999/01/12 12:39:14 eivind Exp $
*/
@@ -2627,14 +2627,17 @@ nfs_strategy(ap)
if (bp->b_flags & B_PHYS)
panic("nfs physio");
+
if (bp->b_flags & B_ASYNC)
p = (struct proc *)0;
else
p = curproc; /* XXX */
+
if (bp->b_flags & B_READ)
cr = bp->b_rcred;
else
cr = bp->b_wcred;
+
/*
* If the op is asynchronous and an i/o daemon is waiting
* queue the request, wake it up and wait for completion
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index c973700..fb437a5 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.64 1998/12/07 21:58:43 archie Exp $
+ * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $
*/
@@ -68,6 +68,7 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
static void nfs_prot_buf __P((struct buf *bp, int off, int n));
extern int nfs_numasync;
+extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;
/*
@@ -113,7 +114,7 @@ nfs_getpages(ap)
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
- bp = getpbuf();
+ bp = getpbuf(&nfs_pbuf_freecnt);
npages = btoc(count);
kva = (vm_offset_t) bp->b_data;
@@ -132,10 +133,16 @@ nfs_getpages(ap)
error = nfs_readrpc(vp, &uio, cred);
pmap_qremove(kva, npages);
- relpbuf(bp);
+ relpbuf(bp, &nfs_pbuf_freecnt);
- if (error && (uio.uio_resid == count))
+ if (error && (uio.uio_resid == count)) {
+ printf("nfs_getpages: error %d\n", error);
+ for (i = 0; i < npages; ++i) {
+ if (i != ap->a_reqpage)
+ vnode_pager_freepage(pages[i]);
+ }
return VM_PAGER_ERROR;
+ }
size = count - uio.uio_resid;
@@ -228,7 +235,7 @@ nfs_putpages(ap)
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
- bp = getpbuf();
+ bp = getpbuf(&nfs_pbuf_freecnt);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, pages, npages);
@@ -251,7 +258,7 @@ nfs_putpages(ap)
error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
pmap_qremove(kva, npages);
- relpbuf(bp);
+ relpbuf(bp, &nfs_pbuf_freecnt);
if (!error) {
int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
@@ -439,6 +446,7 @@ again:
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
+
/*
* If we are being called from nfs_getpages, we must
* make sure the buffer is a vmio buffer. The vp will
@@ -779,6 +787,7 @@ again:
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
*/
+
if (bp->b_dirtyend > 0 &&
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
bp->b_proc = p;
@@ -1254,17 +1263,24 @@ nfs_doio(bp, cr, p)
* write rpc with iomode == NFSV3WRITE_FILESYNC before
* the block is reused. This is indicated by setting
* the B_DELWRI and B_NEEDCOMMIT flags.
+ *
+ * If the buffer is marked B_PAGING, it does not reside on
+ * the vp's paging queues so we do not ( and cannot ) reassign
+ * it. XXX numdirtybuffers should be integrated into
+ * reassignbuf() call.
*/
if (error == EINTR
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
int s;
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
- ++numdirtybuffers;
- bp->b_flags |= B_DELWRI;
- s = splbio();
- reassignbuf(bp, vp);
- splx(s);
+ if ((bp->b_flags & B_PAGING) == 0) {
+ ++numdirtybuffers;
+ bp->b_flags |= B_DELWRI;
+ s = splbio();
+ reassignbuf(bp, vp);
+ splx(s);
+ }
if ((bp->b_flags & B_ASYNC) == 0)
bp->b_flags |= B_EINTR;
} else {
diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c
index b3eec24..6c9cfb7 100644
--- a/sys/nfsclient/nfs_subs.c
+++ b/sys/nfsclient/nfs_subs.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95
- * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $
+ * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $
*/
/*
@@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= {
};
int nfs_ticks;
+int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
struct nfssvc_sockhead nfssvc_sockhead;
@@ -1191,6 +1192,8 @@ nfs_init(vfsp)
sysent[SYS_getfh].sy_call = (sy_call_t *)getfh;
#endif
+ nfs_pbuf_freecnt = nswbuf / 2 + 1;
+
return (0);
}
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index c97267a..4131b60 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.115 1998/12/25 10:34:27 dfr Exp $
+ * $Id: nfs_vnops.c,v 1.116 1999/01/12 12:39:14 eivind Exp $
*/
@@ -2627,14 +2627,17 @@ nfs_strategy(ap)
if (bp->b_flags & B_PHYS)
panic("nfs physio");
+
if (bp->b_flags & B_ASYNC)
p = (struct proc *)0;
else
p = curproc; /* XXX */
+
if (bp->b_flags & B_READ)
cr = bp->b_rcred;
else
cr = bp->b_wcred;
+
/*
* If the op is asynchronous and an i/o daemon is waiting
* queue the request, wake it up and wait for completion
diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c
index b3eec24..6c9cfb7 100644
--- a/sys/nfsserver/nfs_srvsubs.c
+++ b/sys/nfsserver/nfs_srvsubs.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95
- * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $
+ * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $
*/
/*
@@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= {
};
int nfs_ticks;
+int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
struct nfssvc_sockhead nfssvc_sockhead;
@@ -1191,6 +1192,8 @@ nfs_init(vfsp)
sysent[SYS_getfh].sy_call = (sy_call_t *)getfh;
#endif
+ nfs_pbuf_freecnt = nswbuf / 2 + 1;
+
return (0);
}
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 191fdbc..f2b0f4b 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.60 1998/10/31 14:05:11 peter Exp $
+ * $Id: buf.h,v 1.61 1998/11/13 01:01:44 dg Exp $
*/
#ifndef _SYS_BUF_H_
@@ -116,7 +116,10 @@ struct buf {
caddr_t b_savekva; /* saved kva for transfer while bouncing */
void *b_driver1; /* for private use by the driver */
void *b_driver2; /* for private use by the driver */
- void *b_spc;
+ union pager_info {
+ void *pg_spc;
+ int pg_reqpage;
+ } b_pager;
union cluster_info {
TAILQ_HEAD(cluster_list_head, buf) cluster_head;
TAILQ_ENTRY(buf) cluster_entry;
@@ -126,9 +129,29 @@ struct buf {
struct workhead b_dep; /* List of filesystem dependencies. */
};
+#define b_spc b_pager.pg_spc
+
/*
* These flags are kept in b_flags.
+ *
+ * Notes:
+ *
+ * B_ASYNC VOP calls on bp's are usually async whether or not
+ * B_ASYNC is set, but some subsystems, such as NFS, like
+ * to know what is best for the caller so they can
+ * optimize the I/O.
+ *
+ * B_PAGING Indicates that bp is being used by the paging system or
+ * some other pager and that the bp is not linked into
+ * the b_vp's clean/dirty linked lists or ref counts.
+ * Buffer vp reassignments are illegal in this case.
+ *
+ * B_CACHE This may only be set if the buffer is entirely valid.
+ * A buffer with B_DELWRI set whose B_CACHE flag gets
+ * cleared MUST be committed to disk so B_DELWRI can
+ * also be cleared.
*/
+
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
@@ -312,13 +335,12 @@ int bowrite __P((struct buf *));
void brelse __P((struct buf *));
void bqrelse __P((struct buf *));
int vfs_bio_awrite __P((struct buf *));
-struct buf * getpbuf __P((void));
+struct buf * getpbuf __P((int *));
struct buf *incore __P((struct vnode *, daddr_t));
struct buf *gbincore __P((struct vnode *, daddr_t));
int inmem __P((struct vnode *, daddr_t));
struct buf *getblk __P((struct vnode *, daddr_t, int, int, int));
struct buf *geteblk __P((int));
-int allocbuf __P((struct buf *, int));
int biowait __P((struct buf *));
void biodone __P((struct buf *));
@@ -336,13 +358,15 @@ void vfs_unbusy_pages __P((struct buf *));
void vwakeup __P((struct buf *));
void vmapbuf __P((struct buf *));
void vunmapbuf __P((struct buf *));
-void relpbuf __P((struct buf *));
+void relpbuf __P((struct buf *, int *));
void brelvp __P((struct buf *));
void bgetvp __P((struct vnode *, struct buf *));
void pbgetvp __P((struct vnode *, struct buf *));
void pbrelvp __P((struct buf *));
+int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
-struct buf *trypbuf __P((void));
+void bpreassignbuf __P((struct buf *, struct vnode *));
+struct buf *trypbuf __P((int *));
void vfs_bio_need_satisfy __P((void));
#endif /* KERNEL */
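Together with the B_PAGING note above, the pb*() entry points are the ones a pager-style buffer is meant to use: it is attached to a vnode with pbgetvp() and detached with pbrelvp(), and it is never handed to bgetvp() or reassignbuf() because it does not live on the vnode's clean/dirty buffer lists. A hedged sketch of that pairing (the wrapper and its free-count argument are hypothetical, and the actual I/O is elided):

	static void
	pager_buf_sketch(struct vnode *vp, int *freecnt)
	{
		struct buf *bp;

		bp = getpbuf(freecnt);	/* physical buffer, kept off vp's buffer lists */
		pbgetvp(vp, bp);	/* associate with vp without reassignbuf() */
		/* ... fill in b_blkno/b_bcount, VOP_STRATEGY(vp, bp), wait for completion ... */
		pbrelvp(bp);		/* drop the association */
		relpbuf(bp, freecnt);
	}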
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 191fdbc..f2b0f4b 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.60 1998/10/31 14:05:11 peter Exp $
+ * $Id: buf.h,v 1.61 1998/11/13 01:01:44 dg Exp $
*/
#ifndef _SYS_BUF_H_
@@ -116,7 +116,10 @@ struct buf {
caddr_t b_savekva; /* saved kva for transfer while bouncing */
void *b_driver1; /* for private use by the driver */
void *b_driver2; /* for private use by the driver */
- void *b_spc;
+ union pager_info {
+ void *pg_spc;
+ int pg_reqpage;
+ } b_pager;
union cluster_info {
TAILQ_HEAD(cluster_list_head, buf) cluster_head;
TAILQ_ENTRY(buf) cluster_entry;
@@ -126,9 +129,29 @@ struct buf {
struct workhead b_dep; /* List of filesystem dependencies. */
};
+#define b_spc b_pager.pg_spc
+
/*
* These flags are kept in b_flags.
+ *
+ * Notes:
+ *
+ * B_ASYNC VOP calls on bp's are usually async whether or not
+ * B_ASYNC is set, but some subsystems, such as NFS, like
+ * to know what is best for the caller so they can
+ * optimize the I/O.
+ *
+ * B_PAGING Indicates that bp is being used by the paging system or
+ * some other pager and that the bp is not linked into
+ * the b_vp's clean/dirty linked lists or ref counts.
+ * Buffer vp reassignments are illegal in this case.
+ *
+ * B_CACHE This may only be set if the buffer is entirely valid.
+ * A buffer with B_DELWRI set whose B_CACHE flag gets
+ * cleared MUST be committed to disk so B_DELWRI can
+ * also be cleared.
*/
+
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
@@ -312,13 +335,12 @@ int bowrite __P((struct buf *));
void brelse __P((struct buf *));
void bqrelse __P((struct buf *));
int vfs_bio_awrite __P((struct buf *));
-struct buf * getpbuf __P((void));
+struct buf * getpbuf __P((int *));
struct buf *incore __P((struct vnode *, daddr_t));
struct buf *gbincore __P((struct vnode *, daddr_t));
int inmem __P((struct vnode *, daddr_t));
struct buf *getblk __P((struct vnode *, daddr_t, int, int, int));
struct buf *geteblk __P((int));
-int allocbuf __P((struct buf *, int));
int biowait __P((struct buf *));
void biodone __P((struct buf *));
@@ -336,13 +358,15 @@ void vfs_unbusy_pages __P((struct buf *));
void vwakeup __P((struct buf *));
void vmapbuf __P((struct buf *));
void vunmapbuf __P((struct buf *));
-void relpbuf __P((struct buf *));
+void relpbuf __P((struct buf *, int *));
void brelvp __P((struct buf *));
void bgetvp __P((struct vnode *, struct buf *));
void pbgetvp __P((struct vnode *, struct buf *));
void pbrelvp __P((struct buf *));
+int allocbuf __P((struct buf *bp, int size));
void reassignbuf __P((struct buf *, struct vnode *));
-struct buf *trypbuf __P((void));
+void bpreassignbuf __P((struct buf *, struct vnode *));
+struct buf *trypbuf __P((int *));
void vfs_bio_need_satisfy __P((void));
#endif /* KERNEL */
diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h
index d8e0cd8..87949b8 100644
--- a/sys/sys/malloc.h
+++ b/sys/sys/malloc.h
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)malloc.h 8.5 (Berkeley) 5/3/95
- * $Id: malloc.h,v 1.37 1998/03/08 09:58:26 julian Exp $
+ * $Id: malloc.h,v 1.38 1998/11/10 08:46:24 peter Exp $
*/
#ifndef _SYS_MALLOC_H_
@@ -42,11 +42,13 @@
#define KMEMSTATS
/*
- * flags to malloc
+ * flags to malloc.
*/
+
#define M_WAITOK 0x0000
-#define M_NOWAIT 0x0001
-#define M_KERNEL 0x0002
+#define M_NOWAIT 0x0001 /* do not block */
+#define M_USE_RESERVE 0x0002 /* can alloc out of reserve memory */
+#define M_ASLEEP 0x0004 /* async sleep on failure */
#define M_MAGIC 877983977 /* time when first defined :-) */
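M_KERNEL is gone: callers now say what they mean with M_NOWAIT (fail rather than block), M_USE_RESERVE (allowed to dip into reserve memory), and M_ASLEEP (async sleep on failure), with M_WAITOK remaining the blocking default. A minimal sketch of the common non-blocking pattern; the structure, the wrapper function and the use of M_TEMP are placeholders, not part of this commit:

	struct foo { int dummy; };		/* placeholder type */

	static int
	alloc_sketch(void)
	{
		struct foo *fp;

		fp = malloc(sizeof(*fp), M_TEMP, M_NOWAIT);
		if (fp == NULL)
			return (ENOMEM);	/* no memory right now; caller must cope */
		/* ... initialize and use fp ... */
		free(fp, M_TEMP);
		return (0);
	}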
diff --git a/sys/sys/param.h b/sys/sys/param.h
index badddca..fb15db3 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)param.h 8.3 (Berkeley) 4/4/95
- * $Id: param.h,v 1.37 1998/10/16 04:28:04 jkh Exp $
+ * $Id: param.h,v 1.38 1998/10/16 06:55:07 jkh Exp $
*/
#ifndef _SYS_PARAM_H_
@@ -227,4 +227,10 @@
#define FSHIFT 11 /* bits to right of fixed binary point */
#define FSCALE (1<<FSHIFT)
+#define dbtoc(db) /* calculates devblks to pages */ \
+ (((db) + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
+
+#define ctodb(db) /* calculates pages to devblks */ \
+ ((db) << (PAGE_SHIFT - DEV_BSHIFT))
+
#endif /* _SYS_PARAM_H_ */
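For concreteness, with the common 4K page and 512-byte disk block sizes (PAGE_SHIFT 12, DEV_BSHIFT 9; assumed here, not something this commit sets) the new macros work out as follows, with dbtoc() rounding up to whole pages:

	ctodb(1) == 1 << 3 == 8			/* one page covers 8 disk blocks */
	dbtoc(8) == (8 + 7) >> 3 == 1		/* exactly one page */
	dbtoc(9) == (9 + 7) >> 3 == 2		/* partial page rounds up */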
diff --git a/sys/sys/types.h b/sys/sys/types.h
index 93f8698..c65fe67 100644
--- a/sys/sys/types.h
+++ b/sys/sys/types.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)types.h 8.6 (Berkeley) 2/19/95
- * $Id: types.h,v 1.25 1998/06/07 17:13:05 dfr Exp $
+ * $Id: types.h,v 1.26 1998/12/19 00:02:34 dt Exp $
*/
#ifndef _SYS_TYPES_H_
@@ -68,6 +68,7 @@ typedef quad_t * qaddr_t;
typedef char * caddr_t; /* core address */
typedef int32_t daddr_t; /* disk address */
+typedef u_int32_t u_daddr_t; /* unsigned disk address */
typedef u_int32_t dev_t; /* device number */
typedef u_int32_t fixpt_t; /* fixed point number */
typedef u_int32_t gid_t; /* group id */
diff --git a/sys/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h
index ca19cc4..ae5b7af 100644
--- a/sys/ufs/mfs/mfs_extern.h
+++ b/sys/ufs/mfs/mfs_extern.h
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_extern.h 8.4 (Berkeley) 3/30/95
- * $Id: mfs_extern.h,v 1.10 1997/10/16 10:50:00 phk Exp $
+ * $Id: mfs_extern.h,v 1.11 1998/02/03 21:52:02 bde Exp $
*/
#ifndef _UFS_MFS_MFS_EXTERN_H_
@@ -41,8 +41,9 @@ struct buf;
struct mount;
struct proc;
struct vnode;
+struct mfsnode;
-void mfs_doio __P((struct buf *bp, caddr_t base));
+void mfs_doio __P((struct buf *bp, struct mfsnode *mfsnode));
int mfs_mountfs __P((struct vnode *, struct mount *, struct proc *));
int mfs_mountroot __P((void));
diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c
index 1ea0804..73ab75a 100644
--- a/sys/ufs/mfs/mfs_vfsops.c
+++ b/sys/ufs/mfs/mfs_vfsops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95
- * $Id: mfs_vfsops.c,v 1.52 1998/12/07 21:58:49 archie Exp $
+ * $Id: mfs_vfsops.c,v 1.53 1999/01/01 04:14:11 dillon Exp $
*/
@@ -64,8 +64,10 @@ MALLOC_DEFINE(M_MFSNODE, "MFS node", "MFS vnode private part");
u_char * mfs_getimage __P((void));
+#ifdef MFS_ROOT
static caddr_t mfs_rootbase; /* address of mini-root in kernel virtual memory */
static u_long mfs_rootsize; /* size of mini-root in bytes */
+#endif
static int mfs_minor; /* used for building internal dev_t */
@@ -178,7 +180,9 @@ mfs_mount(mp, path, data, ndp, p)
struct mfs_args args;
struct ufsmount *ump;
struct fs *fs;
+#ifdef MFS_ROOT
u_char *base;
+#endif
struct mfsnode *mfsp;
u_int size;
int flags, err;
@@ -344,7 +348,9 @@ mfs_mount(mp, path, data, ndp, p)
goto error_2;
}
+#ifdef MFS_ROOT
dostatfs:
+#endif
/*
* Initialize FS stat information in mount struct; uses both
* mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname
@@ -387,11 +393,8 @@ mfs_start(mp, flags, p)
register struct vnode *vp = VFSTOUFS(mp)->um_devvp;
register struct mfsnode *mfsp = VTOMFS(vp);
register struct buf *bp;
- register caddr_t base;
register int gotsig = 0;
- base = mfsp->mfs_baseoff;
-
/*
* Must set P_SYSTEM to prevent system from trying to kill
* this process. What happens is that the process is unkillable,
@@ -402,11 +405,20 @@ mfs_start(mp, flags, p)
curproc->p_flag |= P_SYSTEM;
while (mfsp->mfs_active) {
+ int s;
+
+ s = splbio();
+
while (bp = bufq_first(&mfsp->buf_queue)) {
bufq_remove(&mfsp->buf_queue, bp);
- mfs_doio(bp, base);
+ splx(s);
+ mfs_doio(bp, mfsp);
wakeup((caddr_t)bp);
+ s = splbio();
}
+
+ splx(s);
+
/*
* If a non-ignored signal is received, try to unmount.
* If that fails, clear the signal (it has been "processed"),
diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c
index 88cfec6..083843c 100644
--- a/sys/ufs/mfs/mfs_vnops.c
+++ b/sys/ufs/mfs/mfs_vnops.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
- * $Id: mfs_vnops.c,v 1.37 1998/07/11 07:46:05 bde Exp $
+ * $Id: mfs_vnops.c,v 1.38 1998/09/07 06:52:01 phk Exp $
*/
#include <sys/param.h>
@@ -41,6 +41,8 @@
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/mman.h>
#include <miscfs/specfs/specdev.h>
@@ -51,6 +53,7 @@ static int mfs_badop __P((struct vop_generic_args *));
static int mfs_bmap __P((struct vop_bmap_args *));
static int mfs_close __P((struct vop_close_args *));
static int mfs_fsync __P((struct vop_fsync_args *));
+static int mfs_freeblks __P((struct vop_freeblks_args *));
static int mfs_inactive __P((struct vop_inactive_args *)); /* XXX */
static int mfs_open __P((struct vop_open_args *));
static int mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */
@@ -66,7 +69,7 @@ static struct vnodeopv_entry_desc mfs_vnodeop_entries[] = {
{ &vop_bmap_desc, (vop_t *) mfs_bmap },
{ &vop_bwrite_desc, (vop_t *) vop_defaultop },
{ &vop_close_desc, (vop_t *) mfs_close },
- { &vop_freeblks_desc, (vop_t *) vop_defaultop },
+ { &vop_freeblks_desc, (vop_t *) mfs_freeblks },
{ &vop_fsync_desc, (vop_t *) mfs_fsync },
{ &vop_getpages_desc, (vop_t *) mfs_getpages },
{ &vop_inactive_desc, (vop_t *) mfs_inactive },
@@ -119,6 +122,38 @@ mfs_fsync(ap)
}
/*
+ * mfs_freeblks() - hook to allow us to free physical memory.
+ *
+ * We implement the B_FREEBUF strategy. We can't just madvise()
+ * here because we have to do it in the correct order vs other bio
+ * requests, so we queue it.
+ */
+
+static int
+mfs_freeblks(ap)
+ struct vop_freeblks_args /* {
+ struct vnode *a_vp;
+ daddr_t a_addr;
+ daddr_t a_length;
+ } */ *ap;
+{
+ struct buf *bp;
+ struct vnode *vp;
+
+ if (!vfinddev(ap->a_vp->v_rdev, VBLK, &vp) || vp->v_usecount == 0)
+ panic("mfs_strategy: bad dev");
+
+ bp = geteblk(ap->a_length);
+ bp->b_flags |= B_FREEBUF | B_BUSY;
+ bp->b_dev = ap->a_vp->v_rdev;
+ bp->b_blkno = ap->a_addr;
+ bp->b_offset = dbtob(ap->a_addr);
+ bp->b_bcount = ap->a_length;
+ VOP_STRATEGY(vp, bp);
+ return(0);
+}
+
+/*
* Pass I/O requests to the memory filesystem process.
*/
static int
@@ -132,26 +167,50 @@ mfs_strategy(ap)
register struct mfsnode *mfsp;
struct vnode *vp;
struct proc *p = curproc; /* XXX */
+ int s;
if (!vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0)
panic("mfs_strategy: bad dev");
mfsp = VTOMFS(vp);
- /* check for mini-root access */
+
+ /*
+ * splbio required for queueing/dequeueing, in case of forwarded
+ * BPs from bio interrupts (??). It may not be necessary.
+ */
+
+ s = splbio();
+
if (mfsp->mfs_pid == 0) {
+ /*
+ * mini-root. Note: B_FREEBUF is not supported at the moment;
+ * I'm not sure what kind of dataspace b_data is in.
+ */
caddr_t base;
base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT);
+ if (bp->b_flags & B_FREEBUF)
+ ;
if (bp->b_flags & B_READ)
bcopy(base, bp->b_data, bp->b_bcount);
else
bcopy(bp->b_data, base, bp->b_bcount);
biodone(bp);
} else if (mfsp->mfs_pid == p->p_pid) {
- mfs_doio(bp, mfsp->mfs_baseoff);
+ /*
+ * VOP to self
+ */
+ splx(s);
+ mfs_doio(bp, mfsp);
+ s = splbio();
} else {
+ /*
+ * VOP from some other process, queue to MFS process and
+ * wake it up.
+ */
bufq_insert_tail(&mfsp->buf_queue, bp);
wakeup((caddr_t)vp);
}
+ splx(s);
return (0);
}
@@ -159,18 +218,59 @@ mfs_strategy(ap)
* Memory file system I/O.
*
* Trivial on the HP since the buffer has already been mapped into KVA space.
+ *
+ * Read and Write are handled with a simple copyin and copyout.
+ *
+ * We also partially support VOP_FREEBLKS() via B_FREEBUF. We can't implement
+ * it completely -- for example, not for fragments or inode metadata -- but we
+ * can implement it for page-aligned requests.
*/
void
-mfs_doio(bp, base)
+mfs_doio(bp, mfsp)
register struct buf *bp;
- caddr_t base;
+ struct mfsnode *mfsp;
{
+ caddr_t base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT);
+
+ if (bp->b_flags & B_FREEBUF) {
+ /*
+ * Implement B_FREEBUF, which allows the filesystem to tell
+ * a block device when blocks are no longer needed (like when
+ * a file is deleted). We use the hook to MADV_FREE the VM.
+ * This makes an MFS filesystem work as well as or better than
+ * a sun-style swap-mounted filesystem.
+ */
+ int bytes = bp->b_bcount;
+
+ if ((vm_offset_t)base & PAGE_MASK) {
+ int n = PAGE_SIZE - ((vm_offset_t)base & PAGE_MASK);
+ bytes -= n;
+ base += n;
+ }
+ if (bytes > 0) {
+ struct madvise_args uap;
- base += (bp->b_blkno << DEV_BSHIFT);
- if (bp->b_flags & B_READ)
+ bytes &= ~PAGE_MASK;
+ if (bytes != 0) {
+ bzero(&uap, sizeof(uap));
+ uap.addr = base;
+ uap.len = bytes;
+ uap.behav = MADV_FREE;
+ madvise(curproc, &uap);
+ }
+ }
+ bp->b_error = 0;
+ } else if (bp->b_flags & B_READ) {
+ /*
+ * Read data from our 'memory' disk
+ */
bp->b_error = copyin(base, bp->b_data, bp->b_bcount);
- else
+ } else {
+ /*
+ * Write data to our 'memory' disk
+ */
bp->b_error = copyout(bp->b_data, base, bp->b_bcount);
+ }
if (bp->b_error)
bp->b_flags |= B_ERROR;
biodone(bp);
@@ -222,7 +322,7 @@ mfs_close(ap)
*/
while (bp = bufq_first(&mfsp->buf_queue)) {
bufq_remove(&mfsp->buf_queue, bp);
- mfs_doio(bp, mfsp->mfs_baseoff);
+ mfs_doio(bp, mfsp);
wakeup((caddr_t)bp);
}
/*
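The B_FREEBUF path in mfs_doio() above only ever MADV_FREEs whole pages: the start of the range is rounded up to a page boundary and the remaining length is rounded down. The same arithmetic as a stand-alone helper, purely as a hedged sketch (this helper does not exist in the commit):

	static void
	trim_to_page_bounds(vm_offset_t *basep, int *bytesp)
	{
		if (*basep & PAGE_MASK) {
			int n = PAGE_SIZE - (*basep & PAGE_MASK);

			*basep += n;		/* round the start up to a page boundary */
			*bytesp -= n;
		}
		if (*bytesp > 0)
			*bytesp &= ~PAGE_MASK;	/* round the length down to whole pages */
		else
			*bytesp = 0;		/* nothing page-sized left to free */
	}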
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index 026d3486..fd3555a 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
- * $Id: ufs_readwrite.c,v 1.54 1998/12/15 03:29:52 julian Exp $
+ * $Id: ufs_readwrite.c,v 1.55 1999/01/07 16:14:19 bde Exp $
*/
#define BLKSIZE(a, b, c) blksize(a, b, c)
@@ -392,7 +392,10 @@ WRITE(ap)
panic("%s: nonsync dir write", WRITE_S);
break;
default:
- panic("%s: type", WRITE_S);
+ panic("%s: type %p %d (%d,%d)", WRITE_S, vp, (int)vp->v_type,
+ (int)uio->uio_offset,
+ (int)uio->uio_resid
+ );
}
fs = ip->I_FS;
@@ -598,9 +601,8 @@ ffs_getpages(ap)
vm_page_busy(m);
vm_page_free(m);
} else if (m == mreq) {
- while (m->flags & PG_BUSY) {
- vm_page_sleep(m, "ffspwt", NULL);
- }
+ while (vm_page_sleep_busy(m, FALSE, "ffspwt"))
+ ;
vm_page_busy(m);
vp->v_lastr = m->pindex + 1;
} else {
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 1010085..49e1a29 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
- * $Id: ufs_vnops.c,v 1.103 1998/12/24 09:45:10 bde Exp $
+ * $Id: ufs_vnops.c,v 1.104 1999/01/07 16:14:19 bde Exp $
*/
#include "opt_quota.h"
@@ -1731,6 +1731,9 @@ ufs_abortop(ap)
/*
* Calculate the logical to physical mapping if not done already,
* then call the device strategy routine.
+ *
+ * In order to be able to swap to a file, the VOP_BMAP operation may not
+ * deadlock on memory. See ufs_bmap() for details.
*/
int
ufs_strategy(ap)
diff --git a/sys/vm/default_pager.c b/sys/vm/default_pager.c
index ba92894..16b7512 100644
--- a/sys/vm/default_pager.c
+++ b/sys/vm/default_pager.c
@@ -28,7 +28,15 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: default_pager.c,v 1.15 1998/02/06 12:14:20 eivind Exp $
+ * The default pager is responsible for supplying backing store to unbacked
+ * storage. The backing store is usually swap so we just fall through to
+ * the swap routines. However, since swap metadata has not been assigned,
+ * the swap routines assign and manage the swap backing store through the
+ * vm_page->swapblk field. The object is only converted when the page is
+ * physically freed after having been cleaned and even then vm_page->swapblk
+ * is maintained whenever a resident page also has swap backing store.
+ *
+ * $Id: default_pager.c,v 1.16 1998/10/13 08:24:42 dg Exp $
*/
#include <sys/param.h>
@@ -78,6 +86,14 @@ default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
return vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(round_page(offset + size)));
}
+/*
+ * deallocate resources associated with default objects. The default objects
+ * have no special resources allocated to them, but the vm_page's being used
+ * in this object might. Still, we do not have to do anything - we will free
+ * the swapblk in the underlying vm_page's when we free the vm_page or
+ * garbage collect the vm_page cache list.
+ */
+
static void
default_pager_dealloc(object)
vm_object_t object;
@@ -88,9 +104,11 @@ default_pager_dealloc(object)
}
/*
- * The default pager has no backing store, so we always return
- * failure.
+ * Load pages from backing store. Since OBJT_DEFAULT is converted to
+ * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never
+ * see a vm_page with assigned swap here.
*/
+
static int
default_pager_getpages(object, m, count, reqpage)
vm_object_t object;
@@ -101,6 +119,13 @@ default_pager_getpages(object, m, count, reqpage)
return VM_PAGER_FAIL;
}
+/*
+ * Store pages to backing store. We should assign swap and initiate
+ * I/O. We do not actually convert the object to OBJT_SWAP here. The
+ * object will be converted when the written-out vm_page_t is moved from the
+ * cache to the free list.
+ */
+
static int
default_pager_putpages(object, m, c, sync, rtvals)
vm_object_t object;
@@ -109,26 +134,22 @@ default_pager_putpages(object, m, c, sync, rtvals)
boolean_t sync;
int *rtvals;
{
- int i;
-
- /*
- * Try to convert the object type into a OBJT_SWAP.
- * If the swp structure allocation fails, convert it
- * back to OBJT_DEFAULT and return failure. Otherwise
- * pass this putpages to the swap pager.
- */
- object->type = OBJT_SWAP;
-
- if (swap_pager_swp_alloc(object, M_KERNEL) != 0) {
- object->type = OBJT_DEFAULT;
- for (i = 0; i < c; i++)
- rtvals[i] = VM_PAGER_FAIL;
- return VM_PAGER_FAIL;
- }
-
return swap_pager_putpages(object, m, c, sync, rtvals);
}
+/*
+ * Tell us whether the backing store for the requested (object,index) is
+ * synchronized. i.e. tell us whether we can throw the page away and
+ * reload it later. So, for example, if we are in the process of writing
+ * the page to its backing store, or if no backing store has been assigned,
+ * it is not yet synchronized.
+ *
+ * It is possible to have fully-synchronized swap assigned without the
+ * object having been converted. We just call swap_pager_haspage() to
+ * deal with it since it must already deal with it plus deal with swap
+ * meta-data structures.
+ */
+
static boolean_t
default_pager_haspage(object, pindex, before, after)
vm_object_t object;
@@ -139,24 +160,3 @@ default_pager_haspage(object, pindex, before, after)
return FALSE;
}
-void
-default_pager_convert_to_swap(object)
- vm_object_t object;
-{
- object->type = OBJT_SWAP;
- if (swap_pager_swp_alloc(object, M_KERNEL) != 0) {
- object->type = OBJT_DEFAULT;
- }
-}
-
-void
-default_pager_convert_to_swapq(object)
- vm_object_t object;
-{
- if (object &&
- (object->type == OBJT_DEFAULT) &&
- (object != kernel_object && object != kmem_object) &&
- (object->size > ((cnt.v_page_count - cnt.v_wire_count) / 4)))
- default_pager_convert_to_swap(object);
-}
-
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c
index a200b9c..cc742b0 100644
--- a/sys/vm/device_pager.c
+++ b/sys/vm/device_pager.c
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)device_pager.c 8.1 (Berkeley) 6/11/93
- * $Id: device_pager.c,v 1.36 1998/12/07 21:58:50 archie Exp $
+ * $Id: device_pager.c,v 1.37 1999/01/08 17:31:23 eivind Exp $
*/
#include <sys/param.h>
@@ -200,7 +200,7 @@ dev_pager_getpages(object, m, count, reqpage)
int prot;
dev = (dev_t) (uintptr_t) object->handle;
- offset = m[reqpage]->pindex + OFF_TO_IDX(object->paging_offset);
+ offset = m[reqpage]->pindex;
prot = PROT_READ; /* XXX should pass in? */
mapfunc = cdevsw[major(dev)]->d_mmap;
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 1691168..b063520 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 1998 Matthew Dillon,
* Copyright (c) 1994 John S. Dyson
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
@@ -36,17 +37,34 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
+ * New Swap System
+ * Matthew Dillon
+ *
+ * Radix Bitmap 'blists'.
+ *
+ * - The new swapper uses the new radix bitmap code. This should scale
+ * to arbitrarily small or arbitrarily large swap spaces and an almost
+ * arbitrary degree of fragmentation.
+ *
+ * Features:
+ *
+ * - on the fly reallocation of swap during putpages. The new system
+ * does not try to keep previously allocated swap blocks for dirty
+ * pages.
+ *
+ * - on the fly deallocation of swap
+ *
+ * - No more garbage collection required. Unnecessarily allocated swap
+ * blocks only exist for dirty vm_page_t's now and these are already
+ * cycled (in a high-load system) by the pager. We also do on-the-fly
+ * removal of invalidated swap blocks when a page is destroyed
+ * or renamed.
+ *
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
*
* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
- * $Id: swap_pager.c,v 1.106 1999/01/08 17:31:23 eivind Exp $
- */
-
-/*
- * Quick hack to page to dedicated partition(s).
- * TODO:
- * Add multiprocessor locks
- * Deal with async writes in a better fashion
+ *
+ * $Id: swap_pager.c,v 1.107 1999/01/10 01:58:28 eivind Exp $
*/
#include <sys/param.h>
@@ -57,18 +75,16 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
-#include <sys/rlist.h>
+#include <sys/blist.h>
+#include <sys/lock.h>
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif
-#ifndef NPENDINGIO
-#define NPENDINGIO 16
-#endif
-
-#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
+#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
+#include "opt_swap.h"
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
@@ -77,848 +93,651 @@
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
+#include <vm/vm_zone.h>
-static int nswiodone;
-int swap_pager_full;
-extern int vm_swap_size;
-static int no_swap_space = 1;
-static int max_pageout_cluster;
-struct rlisthdr swaplist;
-
-TAILQ_HEAD(swpclean, swpagerclean);
-
-typedef struct swpagerclean *swp_clean_t;
+#define SWM_FREE 0x02 /* free, period */
+#define SWM_POP 0x04 /* pop out */
-static struct swpagerclean {
- TAILQ_ENTRY(swpagerclean) spc_list;
- int spc_flags;
- struct buf *spc_bp;
- vm_object_t spc_object;
- vm_offset_t spc_kva;
- int spc_first;
- int spc_count;
- vm_page_t spc_m[MAX_PAGEOUT_CLUSTER];
-} swcleanlist[NPENDINGIO];
-
-
-/* spc_flags values */
-#define SPC_ERROR 0x01
+/*
+ * vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks
+ * in the old system.
+ */
-#define SWB_EMPTY (-1)
+extern int vm_swap_size; /* number of free swap blocks, in pages */
-/* list of completed page cleans */
-static struct swpclean swap_pager_done;
+int swap_pager_full; /* swap space exhaustion (w/ hysteresis)*/
+static int nsw_rcount; /* free read buffers */
+static int nsw_wcount; /* free write buffers */
+static int nsw_hysteresis; /* hysteresis */
+static int max_pageout_cluster; /* maximum VOP I/O allowed */
+static int sw_alloc_interlock; /* swap pager allocation interlock */
-/* list of pending page cleans */
-static struct swpclean swap_pager_inuse;
+struct blist *swapblist;
+static struct swblock **swhash;
+static int swhash_mask;
-/* list of free pager clean structs */
-static struct swpclean swap_pager_free;
-static int swap_pager_free_count;
-static int swap_pager_free_pending;
-/* list of "named" anon region objects */
-static struct pagerlst swap_pager_object_list;
+/*
+ * "named" and "unnamed" anon region objects. Try to reduce the overhead
+ * of searching a named list by hashing it just a little.
+ */
-/* list of "unnamed" anon region objects */
-struct pagerlst swap_pager_un_object_list;
+#define NOBJLISTS 8
-#define SWAP_FREE_NEEDED 0x1 /* need a swap block */
-#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2
-static int swap_pager_needflags;
+#define NOBJLIST(handle) \
+ (&swap_pager_object_list[((int)(long)handle >> 4) & (NOBJLISTS-1)])
-static struct pagerlst *swp_qs[] = {
- &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0
-};
+static struct pagerlst swap_pager_object_list[NOBJLISTS];
+struct pagerlst swap_pager_un_object_list;
+vm_zone_t swap_zone;
/*
- * pagerops for OBJT_SWAP - "swap pager".
+ * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
+ * calls hooked from other parts of the VM system and do not appear here.
+ * (see vm/swap_pager.h).
*/
+
static vm_object_t
swap_pager_alloc __P((void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t offset));
static void swap_pager_dealloc __P((vm_object_t object));
-static boolean_t
- swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex,
- int *before, int *after));
static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void swap_pager_init __P((void));
-static void spc_free __P((swp_clean_t));
+static void swap_pager_unswapped __P((vm_page_t));
struct pagerops swappagerops = {
- swap_pager_init,
- swap_pager_alloc,
- swap_pager_dealloc,
- swap_pager_getpages,
- swap_pager_putpages,
- swap_pager_haspage,
- swap_pager_sync
+ swap_pager_init, /* early system initialization of pager */
+ swap_pager_alloc, /* allocate an OBJT_SWAP object */
+ swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
+ swap_pager_getpages, /* pagein */
+ swap_pager_putpages, /* pageout */
+ swap_pager_haspage, /* get backing store status for page */
+ swap_pager_unswapped /* remove swap related to page */
};
-static int npendingio;
-static int dmmin;
+/*
+ * dmmax is in page-sized chunks with the new swap system. It was
+ * dev-bsized chunks in the old.
+ *
+ * swap_*() routines are externally accessible. swp_*() routines are
+ * internal.
+ */
+
int dmmax;
+static int dmmax_mask;
+int nswap_lowat = 128; /* in pages, swap_pager_full warning */
+int nswap_hiwat = 256; /* in pages, swap_pager_full warning */
+
+static __inline void swp_sizecheck __P((void));
+static void swp_pager_sync_iodone __P((struct buf *bp));
+static void swp_pager_async_iodone __P((struct buf *bp));
+
+/*
+ * Swap bitmap functions
+ */
+
+static __inline void swp_pager_freeswapspace __P((daddr_t blk, int npages));
+static __inline daddr_t swp_pager_getswapspace __P((int npages));
+
+/*
+ * Metadata functions
+ */
+
+static void swp_pager_meta_build __P((vm_object_t, daddr_t, daddr_t, int));
+static void swp_pager_meta_free __P((vm_object_t, daddr_t, daddr_t));
+static void swp_pager_meta_free_all __P((vm_object_t));
+static daddr_t swp_pager_meta_ctl __P((vm_object_t, vm_pindex_t, int));
-static int swap_pager_block_index __P((vm_pindex_t pindex));
-static int swap_pager_block_offset __P((vm_pindex_t pindex));
-static daddr_t *swap_pager_diskaddr __P((vm_object_t object,
- vm_pindex_t pindex, int *valid));
-static void swap_pager_finish __P((swp_clean_t spc));
-static void swap_pager_free_swap __P((vm_object_t object));
-static void swap_pager_freeswapspace __P((vm_object_t object,
- unsigned int from,
- unsigned int to));
-static int swap_pager_getswapspace __P((vm_object_t object,
- unsigned int amount,
- daddr_t *rtval));
-static void swap_pager_iodone __P((struct buf *));
-static void swap_pager_iodone1 __P((struct buf *bp));
-static void swap_pager_reclaim __P((void));
-static void swap_pager_ridpages __P((vm_page_t *m, int count,
- int reqpage));
-static void swap_pager_setvalid __P((vm_object_t object,
- vm_offset_t offset, int valid));
-static __inline void swapsizecheck __P((void));
-
-#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE)))
+/*
+ * SWP_SIZECHECK() - update swap_pager_full indication
+ *
+ * update the swap_pager_full indication and warn when we are
+ * about to run out of swap space.
+ *
+ * No restrictions on call
+ * This routine may not block.
+ * This routine must be called at splvm()
+ */
static __inline void
-swapsizecheck()
+swp_sizecheck()
{
- if (vm_swap_size < 128 * btodb(PAGE_SIZE)) {
+ if (vm_swap_size < nswap_lowat) {
if (swap_pager_full == 0)
printf("swap_pager: out of swap space\n");
swap_pager_full = 1;
- } else if (vm_swap_size > 192 * btodb(PAGE_SIZE))
+ } else if (vm_swap_size > nswap_hiwat) {
swap_pager_full = 0;
+ }
}
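/*
 * Hysteresis, by the numbers (using the defaults declared above): once
 * vm_swap_size drops below nswap_lowat (128 pages) the warning is
 * printed and swap_pager_full is set; it is not cleared again until
 * vm_swap_size rises above nswap_hiwat (256 pages), so the indicator
 * does not flap while free swap hovers near the low watermark.
 */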
+/*
+ * SWAP_PAGER_INIT() - initialize the swap pager!
+ *
+ * Expected to be started from system init. NOTE: This code is run
+ * before much else so be careful what you depend on. Most of the VM
+ * system has yet to be initialized at this point.
+ */
+
static void
swap_pager_init()
{
- int maxsafepending;
- TAILQ_INIT(&swap_pager_object_list);
- TAILQ_INIT(&swap_pager_un_object_list);
-
/*
- * Initialize clean lists
+ * Initialize object lists
*/
- TAILQ_INIT(&swap_pager_inuse);
- TAILQ_INIT(&swap_pager_done);
- TAILQ_INIT(&swap_pager_free);
- swap_pager_free_count = 0;
+ int i;
+
+ for (i = 0; i < NOBJLISTS; ++i)
+ TAILQ_INIT(&swap_pager_object_list[i]);
+ TAILQ_INIT(&swap_pager_un_object_list);
/*
- * Calculate the swap allocation constants.
+ * Device Stripe, in PAGE_SIZE'd blocks
*/
- dmmin = PAGE_SIZE / DEV_BSIZE;
- dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2;
-
- maxsafepending = cnt.v_free_min - cnt.v_free_reserved;
- npendingio = NPENDINGIO;
- max_pageout_cluster = MAX_PAGEOUT_CLUSTER;
-
- if ((2 * NPENDINGIO * MAX_PAGEOUT_CLUSTER) > maxsafepending) {
- max_pageout_cluster = MAX_PAGEOUT_CLUSTER / 2;
- npendingio = maxsafepending / (2 * max_pageout_cluster);
- if (npendingio < 2)
- npendingio = 2;
- }
+
+ dmmax = SWB_NPAGES * 2;
+ dmmax_mask = ~(dmmax - 1);
}
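/*
 * A worked example of the stripe constants set up above: with
 * MAX_PAGEOUT_CLUSTER at its default of 16, SWB_NPAGES is 16, dmmax is
 * 32 pages and dmmax_mask is ~31.  Two swap block numbers blk1 and blk2
 * then lie within the same device stripe exactly when
 * ((blk1 ^ blk2) & dmmax_mask) == 0, which is the test getpages uses
 * when growing a cluster around the requested page.
 */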
+/*
+ * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
+ *
+ * Expected to be started from pageout process once, prior to entering
+ * its main loop.
+ */
+
void
swap_pager_swap_init()
{
- swp_clean_t spc;
- struct buf *bp;
- int i;
+ int n;
/*
- * kva's are allocated here so that we dont need to keep doing
- * kmem_alloc pageables at runtime
+ * Number of in-transit swap bp operations. Don't
+ * exhaust the pbufs completely. Make sure we
+ * initialize workable values (0 will work for hysteresis
+ * but it isn't very efficient).
+ *
+ * The max_pageout_cluster is constrained by the bp->b_pages[]
+ * array (MAXPHYS/PAGE_SIZE) and our locally defined
+ * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
+ * constrained by the swap device interleave stripe size.
*/
- for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) {
- spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * max_pageout_cluster);
- if (!spc->spc_kva) {
- break;
- }
- spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL);
- if (!spc->spc_bp) {
- kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
- break;
- }
- spc->spc_flags = 0;
- TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
- swap_pager_free_count++;
- }
-}
-int
-swap_pager_swp_alloc(object, wait)
- vm_object_t object;
- int wait;
-{
- sw_blk_t swb;
- int nblocks;
- int i, j;
-
- nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES;
- swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait);
- if (swb == NULL)
- return 1;
-
- for (i = 0; i < nblocks; i++) {
- swb[i].swb_valid = 0;
- swb[i].swb_locked = 0;
- for (j = 0; j < SWB_NPAGES; j++)
- swb[i].swb_block[j] = SWB_EMPTY;
- }
+ nsw_rcount = (nswbuf + 1) / 2;
+ nsw_wcount = (nswbuf + 3) / 4;
+ nsw_hysteresis = nsw_wcount / 2;
+ max_pageout_cluster = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
- object->un_pager.swp.swp_nblocks = nblocks;
- object->un_pager.swp.swp_allocsize = 0;
- object->un_pager.swp.swp_blocks = swb;
- object->un_pager.swp.swp_poip = 0;
+ /*
+ * Initialize our zone. Right now I'm just guessing on the number
+ * we need based on the number of pages in the system. Each swblock
+ * can hold 16 pages, so this is probably overkill.
+ */
- if (object->handle != NULL) {
- TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list);
- } else {
- TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
- }
+ n = cnt.v_page_count * 2;
- return 0;
+ swap_zone = zinit(
+ "SWAPMETA",
+ sizeof(struct swblock),
+ n,
+ ZONE_INTERRUPT,
+ 1
+ );
+
+ /*
+ * Initialize our meta-data hash table. The swapper does not need to
+ * be quite as efficient as the VM system, so we do not use an
+ * oversized hash table.
+ *
+ * n: size of hash table, must be power of 2
+ * swhash_mask: hash table index mask
+ */
+
+ for (n = 1; n < cnt.v_page_count / 4; n <<= 1)
+ ;
+
+ swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK);
+ bzero(swhash, sizeof(struct swblock *) * n);
+
+ swhash_mask = n - 1;
}
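/*
 * Illustration of the sizing above: the loop leaves n as the smallest
 * power of two that is >= cnt.v_page_count / 4, so on a machine with,
 * say, 8192 pages n becomes 2048 and swhash_mask becomes 2047.  Because
 * n is a power of two, a swblock hash value reduces to a table index
 * with a simple mask:
 *
 *	idx = hashval & swhash_mask;	(equivalent to hashval % n)
 */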
/*
- * Allocate an object and associated resources.
- * Note that if we are called from the pageout daemon (handle == NULL)
- * we should not wait for memory as it could resulting in deadlock.
+ * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
+ * its metadata structures.
+ *
+ * This routine is called from the mmap and fork code to create a new
+ * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
+ * and then converting it with swp_pager_meta_build().
+ *
+ * This routine may block in vm_object_allocate() and create a named
+ * object lookup race, so we must interlock. We must also run at
+ * splvm() for the object lookup to handle races with interrupts, but
+ * we do not have to maintain splvm() in between the lookup and the
+ * add because (I believe) it is not possible to attempt to create
+ * a new swap object w/handle when a default object with that handle
+ * already exists.
*/
+
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t offset)
{
vm_object_t object;
- /*
- * If this is a "named" anonymous region, look it up and use the
- * object if it exists, otherwise allocate a new one.
- */
if (handle) {
- object = vm_pager_object_lookup(&swap_pager_object_list, handle);
+ /*
+ * Reference existing named region or allocate new one. There
+ * should not be a race here against swp_pager_meta_build()
+ * as called from vm_page_remove() in regards to the lookup
+ * of the handle.
+ */
+
+ while (sw_alloc_interlock) {
+ sw_alloc_interlock = -1;
+ tsleep(&sw_alloc_interlock, PVM, "swpalc", 0);
+ }
+ sw_alloc_interlock = 1;
+
+ object = vm_pager_object_lookup(NOBJLIST(handle), handle);
+
if (object != NULL) {
vm_object_reference(object);
} else {
- /*
- * XXX - there is a race condition here. Two processes
- * can request the same named object simultaneuously,
- * and if one blocks for memory, the result is a disaster.
- * Probably quite rare, but is yet another reason to just
- * rip support of "named anonymous regions" out altogether.
- */
- object = vm_object_allocate(OBJT_SWAP,
+ object = vm_object_allocate(OBJT_DEFAULT,
OFF_TO_IDX(offset + PAGE_MASK + size));
object->handle = handle;
- (void) swap_pager_swp_alloc(object, M_WAITOK);
+
+ swp_pager_meta_build(
+ object,
+ 0,
+ SWAPBLK_NONE,
+ 0
+ );
}
+
+ if (sw_alloc_interlock < 0)
+ wakeup(&sw_alloc_interlock);
+
+ sw_alloc_interlock = 0;
} else {
- object = vm_object_allocate(OBJT_SWAP,
+ object = vm_object_allocate(OBJT_DEFAULT,
OFF_TO_IDX(offset + PAGE_MASK + size));
- (void) swap_pager_swp_alloc(object, M_WAITOK);
+
+ swp_pager_meta_build(
+ object,
+ 0,
+ SWAPBLK_NONE,
+ 0
+ );
}
return (object);
}
/*
- * returns disk block associated with pager and offset
- * additionally, as a side effect returns a flag indicating
- * if the block has been written
+ * SWAP_PAGER_DEALLOC() - remove swap metadata from object
+ *
+ * The swap backing for the object is destroyed. The code is
+ * designed such that we can reinstantiate it later, but this
+ * routine is typically called only when the entire object is
+ * about to be destroyed.
+ *
+ * This routine may block, but no longer does.
+ *
+ * The object must be locked or unreferenceable.
*/
-static __inline daddr_t *
-swap_pager_diskaddr(object, pindex, valid)
+static void
+swap_pager_dealloc(object)
vm_object_t object;
- vm_pindex_t pindex;
- int *valid;
{
- register sw_blk_t swb;
- int ix;
-
- if (valid)
- *valid = 0;
- ix = pindex / SWB_NPAGES;
- if ((ix >= object->un_pager.swp.swp_nblocks) ||
- (pindex >= object->size)) {
- return (FALSE);
+ /*
+ * Remove from list right away so lookups will fail if we block for
+ * pageout completion.
+ */
+
+ if (object->handle == NULL) {
+ TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
+ } else {
+ TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
}
- swb = &object->un_pager.swp.swp_blocks[ix];
- ix = pindex % SWB_NPAGES;
- if (valid)
- *valid = swb->swb_valid & (1 << ix);
- return &swb->swb_block[ix];
-}
-/*
- * Utility routine to set the valid (written) bit for
- * a block associated with a pager and offset
- */
-static void
-swap_pager_setvalid(object, offset, valid)
- vm_object_t object;
- vm_offset_t offset;
- int valid;
-{
- register sw_blk_t swb;
- int ix;
+ vm_object_pip_wait(object, "swpdea");
- ix = offset / SWB_NPAGES;
- if (ix >= object->un_pager.swp.swp_nblocks)
- return;
+ /*
+ * Free all remaining metadata. We only bother to free it from
+ * the swap meta data. We do not attempt to free swapblk's still
+ * associated with vm_page_t's for this object. We do not care
+ * if paging is still in progress on some objects.
+ */
- swb = &object->un_pager.swp.swp_blocks[ix];
- ix = offset % SWB_NPAGES;
- if (valid)
- swb->swb_valid |= (1 << ix);
- else
- swb->swb_valid &= ~(1 << ix);
- return;
+ swp_pager_meta_free_all(object);
}
+/************************************************************************
+ * SWAP PAGER BITMAP ROUTINES *
+ ************************************************************************/
+
/*
- * this routine allocates swap space with a fragmentation
- * minimization policy.
+ * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
+ *
+ * Allocate swap for the requested number of pages. The starting
+ * swap block number (a page index) is returned or SWAPBLK_NONE
+ * if the allocation failed.
+ *
+ * Also has the side effect of advising that somebody made a mistake
+ * when they configured swap and didn't configure enough.
+ *
+ * Must be called at splvm() to avoid races with bitmap frees from
+ * vm_page_remove() aka swap_pager_page_removed().
+ *
+ * This routine may not block
+ * This routine must be called at splvm().
*/
-static int
-swap_pager_getswapspace(object, amount, rtval)
- vm_object_t object;
- unsigned int amount;
- daddr_t *rtval;
+
+static __inline daddr_t
+swp_pager_getswapspace(npages)
+ int npages;
{
- unsigned location;
+ daddr_t blk;
- vm_swap_size -= amount;
-
- if (!rlist_alloc(&swaplist, amount, &location)) {
- vm_swap_size += amount;
- return 0;
+ if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
+ printf("swap_pager_getswapspace: failed\n");
} else {
- swapsizecheck();
- object->un_pager.swp.swp_allocsize += amount;
- *rtval = location;
- return 1;
+ vm_swap_size -= npages;
+ swp_sizecheck();
}
+ return(blk);
}
/*
- * this routine frees swap space with a fragmentation
- * minimization policy.
+ * SWP_PAGER_FREESWAPSPACE() - free raw swap space
+ *
+ * This routine returns the specified swap blocks back to the bitmap.
+ *
+ * Note: This routine may not block (it could in the old swap code),
+ * and through the use of the new blist routines it does not block.
+ *
+ * We must be called at splvm() to avoid races with bitmap frees from
+ * vm_page_remove() aka swap_pager_page_removed().
+ *
+ * This routine may not block
+ * This routine must be called at splvm().
*/
-static void
-swap_pager_freeswapspace(object, from, to)
- vm_object_t object;
- unsigned int from;
- unsigned int to;
+
+static __inline void
+swp_pager_freeswapspace(blk, npages)
+ daddr_t blk;
+ int npages;
{
- rlist_free(&swaplist, from, to);
- vm_swap_size += (to - from) + 1;
- object->un_pager.swp.swp_allocsize -= (to - from) + 1;
- swapsizecheck();
+ blist_free(swapblist, blk, npages);
+ vm_swap_size += npages;
+ swp_sizecheck();
}
+
/*
- * this routine frees swap blocks from a specified pager
+ * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
+ * range within an object.
+ *
+ * This is a globally accessible routine.
+ *
+ * This routine removes swapblk assignments from swap metadata.
+ *
+ * The external callers of this routine typically have already destroyed
+ * or renamed vm_page_t's associated with this range in the object so
+ * we should be ok.
*/
+
void
swap_pager_freespace(object, start, size)
vm_object_t object;
vm_pindex_t start;
vm_size_t size;
{
- vm_pindex_t i;
- int s;
-
- s = splvm();
- for (i = start; i < start + size; i += 1) {
- int valid;
- daddr_t *addr = swap_pager_diskaddr(object, i, &valid);
-
- if (addr && *addr != SWB_EMPTY) {
- swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1);
- if (valid) {
- swap_pager_setvalid(object, i, 0);
- }
- *addr = SWB_EMPTY;
- }
- }
- splx(s);
+ swp_pager_meta_free(object, start, size);
}
/*
- * same as freespace, but don't free, just force a DMZ next time
- */
-void
-swap_pager_dmzspace(object, start, size)
- vm_object_t object;
- vm_pindex_t start;
- vm_size_t size;
-{
- vm_pindex_t i;
- int s;
-
- s = splvm();
- for (i = start; i < start + size; i += 1) {
- int valid;
- daddr_t *addr = swap_pager_diskaddr(object, i, &valid);
-
- if (addr && *addr != SWB_EMPTY) {
- if (valid) {
- swap_pager_setvalid(object, i, 0);
- }
- }
- }
- splx(s);
-}
-
-static void
-swap_pager_free_swap(object)
- vm_object_t object;
-{
- register int i, j;
- register sw_blk_t swb;
- int first_block=0, block_count=0;
- int s;
- /*
- * Free left over swap blocks
- */
- swb = object->un_pager.swp.swp_blocks;
- if (swb == NULL) {
- return;
- }
-
- s = splvm();
- for (i = 0; i < object->un_pager.swp.swp_nblocks; i++, swb++) {
- for (j = 0; j < SWB_NPAGES; j++) {
- if (swb->swb_block[j] != SWB_EMPTY) {
- /*
- * initially the length of the run is zero
- */
- if (block_count == 0) {
- first_block = swb->swb_block[j];
- block_count = btodb(PAGE_SIZE);
- swb->swb_block[j] = SWB_EMPTY;
- /*
- * if the new block can be included into the current run
- */
- } else if (swb->swb_block[j] == first_block + block_count) {
- block_count += btodb(PAGE_SIZE);
- swb->swb_block[j] = SWB_EMPTY;
- /*
- * terminate the previous run, and start a new one
- */
- } else {
- swap_pager_freeswapspace(object, first_block,
- (unsigned) first_block + block_count - 1);
- first_block = swb->swb_block[j];
- block_count = btodb(PAGE_SIZE);
- swb->swb_block[j] = SWB_EMPTY;
- }
- }
- }
- }
-
- if (block_count) {
- swap_pager_freeswapspace(object, first_block,
- (unsigned) first_block + block_count - 1);
- }
- splx(s);
-}
-
-
-/*
- * swap_pager_reclaim frees up over-allocated space from all pagers
- * this eliminates internal fragmentation due to allocation of space
- * for segments that are never swapped to. It has been written so that
- * it does not block until the rlist_free operation occurs; it keeps
- * the queues consistant.
- */
-
-/*
- * Maximum number of blocks (pages) to reclaim per pass
- */
-#define MAXRECLAIM 128
-
-static void
-swap_pager_reclaim()
-{
- vm_object_t object;
- int i, j, k;
- int s;
- int reclaimcount;
- static struct {
- int address;
- vm_object_t object;
- } reclaims[MAXRECLAIM];
- static int in_reclaim;
-
- /*
- * allow only one process to be in the swap_pager_reclaim subroutine
- */
- s = splvm();
- if (in_reclaim) {
- tsleep(&in_reclaim, PSWP, "swrclm", 0);
- splx(s);
- return;
- }
- in_reclaim = 1;
- reclaimcount = 0;
-
- /* for each pager queue */
- for (k = 0; swp_qs[k]; k++) {
-
- object = TAILQ_FIRST(swp_qs[k]);
- while (object && (reclaimcount < MAXRECLAIM)) {
-
- /*
- * see if any blocks associated with a pager has been
- * allocated but not used (written)
- */
- if ((object->flags & OBJ_DEAD) == 0 &&
- (object->paging_in_progress == 0)) {
- for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) {
- sw_blk_t swb = &object->un_pager.swp.swp_blocks[i];
-
- if (swb->swb_locked)
- continue;
- for (j = 0; j < SWB_NPAGES; j++) {
- if (swb->swb_block[j] != SWB_EMPTY &&
- (swb->swb_valid & (1 << j)) == 0) {
- reclaims[reclaimcount].address = swb->swb_block[j];
- reclaims[reclaimcount++].object = object;
- swb->swb_block[j] = SWB_EMPTY;
- if (reclaimcount >= MAXRECLAIM)
- goto rfinished;
- }
- }
- }
- }
- object = TAILQ_NEXT(object, pager_object_list);
- }
- }
-
-rfinished:
-
- /*
- * free the blocks that have been added to the reclaim list
- */
- for (i = 0; i < reclaimcount; i++) {
- swap_pager_freeswapspace(reclaims[i].object,
- reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1);
- }
- splx(s);
- in_reclaim = 0;
- wakeup(&in_reclaim);
-}
-
-
-/*
- * swap_pager_copy copies blocks from one pager to another and
- * destroys the source pager
+ * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
+ * and destroy the source.
+ *
+ * Copy any valid swapblks from the source to the destination. In
+ * cases where both the source and destination have a valid swapblk,
+ * we keep the destination's.
+ *
+ * This routine is allowed to block. It may block allocating metadata
+ * indirectly through swp_pager_meta_build() or if paging is still in
+ * progress on the source.
+ *
+ * XXX vm_page_collapse() kinda expects us not to block because we
+ * supposedly do not need to allocate memory, but for the moment we
+ * *may* have to get a little memory from the zone allocator, but
+ * it is taken from the interrupt memory. We should be ok.
+ *
+ * The source object contains no vm_page_t's (which is just as well)
+ *
+ * The source object is of type OBJT_SWAP.
+ *
+ * The source and destination objects must be
+ * locked or inaccessible (XXX are they ???)
*/
void
-swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset,
- offset, destroysource)
+swap_pager_copy(srcobject, dstobject, offset, destroysource)
vm_object_t srcobject;
- vm_pindex_t srcoffset;
vm_object_t dstobject;
- vm_pindex_t dstoffset;
vm_pindex_t offset;
int destroysource;
{
vm_pindex_t i;
- int origsize;
- int s;
-
- if (vm_swap_size)
- no_swap_space = 0;
-
- origsize = srcobject->un_pager.swp.swp_allocsize;
/*
- * remove the source object from the swap_pager internal queue
+ * If destroysource is set, we remove the source object from the
+ * swap_pager internal queue now.
*/
+
if (destroysource) {
if (srcobject->handle == NULL) {
- TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list);
+ TAILQ_REMOVE(
+ &swap_pager_un_object_list,
+ srcobject,
+ pager_object_list
+ );
} else {
- TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list);
+ TAILQ_REMOVE(
+ NOBJLIST(srcobject->handle),
+ srcobject,
+ pager_object_list
+ );
}
}
- s = splvm();
- while (srcobject->un_pager.swp.swp_poip) {
- tsleep(srcobject, PVM, "spgout", 0);
- }
-
/*
- * clean all of the pages that are currently active and finished
+ * transfer source to destination.
*/
- if (swap_pager_free_pending)
- swap_pager_sync();
- /*
- * transfer source to destination
- */
- for (i = 0; i < dstobject->size; i += 1) {
- int srcvalid, dstvalid;
- daddr_t *srcaddrp = swap_pager_diskaddr(srcobject,
- i + offset + srcoffset, &srcvalid);
- daddr_t *dstaddrp;
+ for (i = 0; i < dstobject->size; ++i) {
+ daddr_t dstaddr;
/*
- * see if the source has space allocated
+ * Locate (without changing) the swapblk on the destination,
+ * unless it is invalid in which case free it silently, or
+ * if the destination is a resident page, in which case the
+ * source is thrown away.
*/
- if (srcaddrp && *srcaddrp != SWB_EMPTY) {
+
+ dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
+
+ if (dstaddr == SWAPBLK_NONE) {
/*
- * if the source is valid and the dest has no space,
- * then copy the allocation from the srouce to the
- * dest.
+ * Destination has no swapblk and is not resident,
+ * copy source.
*/
- if (srcvalid) {
- dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset,
- &dstvalid);
- /*
- * if the dest already has a valid block,
- * deallocate the source block without
- * copying.
- */
- if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
- swap_pager_freeswapspace(dstobject, *dstaddrp,
- *dstaddrp + btodb(PAGE_SIZE) - 1);
- *dstaddrp = SWB_EMPTY;
- }
- if (dstaddrp && *dstaddrp == SWB_EMPTY) {
- *dstaddrp = *srcaddrp;
- *srcaddrp = SWB_EMPTY;
- dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE);
- srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE);
- swap_pager_setvalid(dstobject, i + dstoffset, 1);
- }
- }
+ daddr_t srcaddr;
+
+ srcaddr = swp_pager_meta_ctl(
+ srcobject,
+ i + offset,
+ SWM_POP
+ );
+
+ if (srcaddr != SWAPBLK_NONE)
+ swp_pager_meta_build(dstobject, i, srcaddr, 1);
+ } else {
/*
- * if the source is not empty at this point, then
- * deallocate the space.
+ * Destination has valid swapblk or it is represented
+ * by a resident page. We destroy the source block.
*/
- if (*srcaddrp != SWB_EMPTY) {
- swap_pager_freeswapspace(srcobject, *srcaddrp,
- *srcaddrp + btodb(PAGE_SIZE) - 1);
- *srcaddrp = SWB_EMPTY;
- }
+
+ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
}
}
- splx(s);
/*
- * Free left over swap blocks
+ * Free left over swap blocks in source.
+ *
+ * We have to revert the type to OBJT_DEFAULT so we do not accidentally
+ * double-remove the object from the swap queues.
*/
- if (destroysource) {
- swap_pager_free_swap(srcobject);
- if (srcobject->un_pager.swp.swp_allocsize) {
- printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n",
- srcobject->un_pager.swp.swp_allocsize, origsize);
- }
-
- free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA);
- srcobject->un_pager.swp.swp_blocks = NULL;
+ if (destroysource) {
+ swp_pager_meta_free_all(srcobject);
+ /*
+ * Reverting the type is not necessary; the caller is going
+ * to destroy srcobject directly, but I'm doing it here
+ * for consistency since we've removed the object from its
+ * queues.
+ */
+ srcobject->type = OBJT_DEFAULT;
}
return;
}
-static void
-swap_pager_dealloc(object)
+/*
+ * SWAP_PAGER_HASPAGE() - determine if we have good backing store for
+ * the requested page.
+ *
+ * We determine whether good backing store exists for the requested
+ * page and return TRUE if it does, FALSE if it doesn't.
+ *
+ * If TRUE, we also try to determine how much valid, contiguous backing
+ * store exists before and after the requested page within a reasonable
+ * distance. We do not try to restrict it to the swap device stripe
+ * (that is handled in getpages/putpages). It probably isn't worth
+ * doing here.
+ */
+
+boolean_t
+swap_pager_haspage(object, pindex, before, after)
vm_object_t object;
+ vm_pindex_t pindex;
+ int *before;
+ int *after;
{
- int s;
- sw_blk_t swb;
+ daddr_t blk0;
/*
- * Remove from list right away so lookups will fail if we block for
- * pageout completion.
+ * do we have good backing store at the requested index ?
*/
- if (object->handle == NULL) {
- TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
- } else {
- TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list);
- }
- /*
- * Wait for all pageouts to finish and remove all entries from
- * cleaning list.
- */
+ blk0 = swp_pager_meta_ctl(object, pindex, 0);
- s = splvm();
- while (object->un_pager.swp.swp_poip) {
- tsleep(object, PVM, "swpout", 0);
+ if (blk0 & SWAPBLK_NONE) {
+ if (before)
+ *before = 0;
+ if (after)
+ *after = 0;
+ return (FALSE);
}
- splx(s);
-
- if (swap_pager_free_pending)
- swap_pager_sync();
/*
- * Free left over swap blocks
+ * find backwards-looking contiguous good backing store
*/
- swap_pager_free_swap(object);
- if (object->un_pager.swp.swp_allocsize) {
- printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n",
- object->un_pager.swp.swp_allocsize);
- }
- swb = object->un_pager.swp.swp_blocks;
- if (swb) {
- /*
- * Free swap management resources
- */
- free(swb, M_VMPGDATA);
- object->un_pager.swp.swp_blocks = NULL;
- }
-}
+ if (before != NULL) {
+ int i;
-static __inline int
-swap_pager_block_index(pindex)
- vm_pindex_t pindex;
-{
- return (pindex / SWB_NPAGES);
-}
-
-static __inline int
-swap_pager_block_offset(pindex)
- vm_pindex_t pindex;
-{
- return (pindex % SWB_NPAGES);
-}
+ for (i = 1; i < (SWB_NPAGES/2); ++i) {
+ daddr_t blk;
-/*
- * swap_pager_haspage returns TRUE if the pager has data that has
- * been written out.
- */
-static boolean_t
-swap_pager_haspage(object, pindex, before, after)
- vm_object_t object;
- vm_pindex_t pindex;
- int *before;
- int *after;
-{
- register sw_blk_t swb;
- int ix;
-
- if (before != NULL)
- *before = 0;
- if (after != NULL)
- *after = 0;
- ix = pindex / SWB_NPAGES;
- if (ix >= object->un_pager.swp.swp_nblocks) {
- return (FALSE);
+ if (i > pindex)
+ break;
+ blk = swp_pager_meta_ctl(object, pindex - i, 0);
+ if (blk & SWAPBLK_NONE)
+ break;
+ if (blk != blk0 - i)
+ break;
+ }
+ *before = (i - 1);
}
- swb = &object->un_pager.swp.swp_blocks[ix];
- ix = pindex % SWB_NPAGES;
-
- if (swb->swb_block[ix] != SWB_EMPTY) {
-
- if (swb->swb_valid & (1 << ix)) {
- int tix;
- if (before) {
- for(tix = ix - 1; tix >= 0; --tix) {
- if ((swb->swb_valid & (1 << tix)) == 0)
- break;
- if ((swb->swb_block[tix] +
- (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) !=
- swb->swb_block[ix])
- break;
- (*before)++;
- }
- }
- if (after) {
- for(tix = ix + 1; tix < SWB_NPAGES; tix++) {
- if ((swb->swb_valid & (1 << tix)) == 0)
- break;
- if ((swb->swb_block[tix] -
- (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) !=
- swb->swb_block[ix])
- break;
- (*after)++;
- }
- }
+ /*
+ * find forward-looking contiguous good backing store
+ */
- return TRUE;
+ if (after != NULL) {
+ int i;
+
+ for (i = 1; i < (SWB_NPAGES/2); ++i) {
+ daddr_t blk;
+
+ blk = swp_pager_meta_ctl(object, pindex + i, 0);
+ if (blk & SWAPBLK_NONE)
+ break;
+ if (blk != blk0 + i)
+ break;
}
+ *after = (i - 1);
}
- return (FALSE);
-}
-/*
- * Wakeup based upon spc state
- */
-static void
-spc_wakeup(void)
-{
- if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) {
- swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT;
- wakeup(&swap_pager_needflags);
- } else if ((swap_pager_needflags & SWAP_FREE_NEEDED) &&
- swap_pager_free_count >= ((2 * npendingio) / 3)) {
- swap_pager_needflags &= ~SWAP_FREE_NEEDED;
- wakeup(&swap_pager_free);
- }
+ return (TRUE);
}
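/*
 * Example of the scans above: suppose pindex 10 maps to swap block 1000,
 * pindexes 8, 9, 11 and 12 map to 998, 999, 1001 and 1002, and pindex 7
 * has no (or a non-contiguous) block.  The backwards loop stops at i == 3
 * and reports *before == 2; if pindex 13 breaks the run the forwards loop
 * reports *after == 2.  Both scans are capped at SWB_NPAGES/2 - 1 pages
 * in each direction.
 */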
/*
- * Free an spc structure
+ * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
+ *
+ * This removes any associated swap backing store, whether valid or
+ * not, from the page.
+ *
+ * This routine is typically called when a page is made dirty, at
+ * which point any associated swap can be freed. MADV_FREE also
+ * calls us in a special-case situation
+ *
+ * NOTE!!! If the page is clean and the swap was valid, the caller
+ * should make the page dirty before calling this routine. This routine
+ * does NOT change the m->dirty status of the page. Also: MADV_FREE
+ * depends on it.
+ *
+ * This routine may not block
*/
-static void
-spc_free(spc)
- swp_clean_t spc;
-{
- spc->spc_flags = 0;
- TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
- swap_pager_free_count++;
- if (swap_pager_needflags) {
- spc_wakeup();
- }
-}
-/*
- * swap_pager_ridpages is a convienience routine that deallocates all
- * but the required page. this is usually used in error returns that
- * need to invalidate the "extra" readahead pages.
- */
static void
-swap_pager_ridpages(m, count, reqpage)
- vm_page_t *m;
- int count;
- int reqpage;
+swap_pager_unswapped(m)
+ vm_page_t m;
{
- int i;
-
- for (i = 0; i < count; i++) {
- if (i != reqpage) {
- vm_page_free(m[i]);
- }
- }
+ swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
}
/*
- * swap_pager_iodone1 is the completion routine for both reads and async writes
+ * SWAP_PAGER_GETPAGES() - bring pages in from swap
+ *
+ * Attempt to retrieve (m, count) pages from backing store, but make
+ * sure we retrieve at least m[reqpage]. We try to load in as large
+ * a chunk surrounding m[reqpage] as is contiguous in swap and which
+ * belongs to the same object.
+ *
+ * The code is designed for asynchronous operation and
+ * immediate-notification of 'reqpage' but tends not to be
+ * used that way. Please do not optimize-out this algorithmic
+ * feature, I intend to improve on it in the future.
+ *	feature; I intend to improve on it in the future.
+ * The parent has a single vm_object_pip_add() reference prior to
+ * calling us and we should return with the same.
+ *
+ * The parent has BUSY'd the pages. We should return with 'm'
+ * left busy, but the others adjusted.
*/
-static void
-swap_pager_iodone1(bp)
- struct buf *bp;
-{
- bp->b_flags |= B_DONE;
- bp->b_flags &= ~B_ASYNC;
- wakeup(bp);
-}
static int
swap_pager_getpages(object, m, count, reqpage)
@@ -926,208 +745,235 @@ swap_pager_getpages(object, m, count, reqpage)
vm_page_t *m;
int count, reqpage;
{
- register struct buf *bp;
- sw_blk_t swb[count];
- register int s;
+ struct buf *bp;
+ vm_page_t mreq;
+ int s;
int i;
- boolean_t rv;
- vm_offset_t kva, off[count];
- vm_pindex_t paging_offset;
- int reqaddr[count];
- int sequential;
-
- int first, last;
- int failed;
- int reqdskregion;
-
- object = m[reqpage]->object;
- paging_offset = OFF_TO_IDX(object->paging_offset);
- sequential = (m[reqpage]->pindex == (object->last_read + 1));
-
- for (i = 0; i < count; i++) {
- vm_pindex_t fidx = m[i]->pindex + paging_offset;
- int ix = swap_pager_block_index(fidx);
-
- if (ix >= object->un_pager.swp.swp_nblocks) {
- int j;
-
- if (i <= reqpage) {
- swap_pager_ridpages(m, count, reqpage);
- return (VM_PAGER_FAIL);
- }
- for (j = i; j < count; j++) {
- vm_page_free(m[j]);
- }
- count = i;
+ int j;
+ daddr_t blk;
+ vm_offset_t kva;
+ vm_pindex_t lastpindex;
+
+ mreq = m[reqpage];
+
+#if !defined(MAX_PERF)
+ if (mreq->object != object) {
+ panic("swap_pager_getpages: object mismatch %p/%p",
+ object,
+ mreq->object
+ );
+ }
+#endif
+ /*
+ * Calculate range to retrieve. The pages have already been assigned
+ * their swapblks. We require a *contiguous* range that falls entirely
+ * within a single device stripe. If we do not supply it, bad things
+ * happen.
+ */
+
+
+ blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
+
+ for (i = reqpage - 1; i >= 0; --i) {
+ daddr_t iblk;
+
+ iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
+ if (iblk & SWAPBLK_NONE)
+ break;
+
+ if ((blk ^ iblk) & dmmax_mask)
+ break;
+
+ if (blk != iblk + (reqpage - i))
break;
- }
- swb[i] = &object->un_pager.swp.swp_blocks[ix];
- off[i] = swap_pager_block_offset(fidx);
- reqaddr[i] = swb[i]->swb_block[off[i]];
}
+ ++i;
- /* make sure that our required input request is existant */
+ for (j = reqpage + 1; j < count; ++j) {
+ daddr_t jblk;
- if (reqaddr[reqpage] == SWB_EMPTY ||
- (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
- swap_pager_ridpages(m, count, reqpage);
- return (VM_PAGER_FAIL);
+ jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
+ if (jblk & SWAPBLK_NONE)
+ break;
+
+ if ((blk ^ jblk) & dmmax_mask)
+ break;
+
+ if (blk != jblk - (j - reqpage))
+ break;
}
- reqdskregion = reqaddr[reqpage] / dmmax;
/*
- * search backwards for the first contiguous page to transfer
+ * If blk itself is bad, well, we can't do any I/O. This should
+ * already be covered as a side effect, but I'm making sure.
*/
- failed = 0;
- first = 0;
- for (i = reqpage - 1; i >= 0; --i) {
- if (sequential || failed || (reqaddr[i] == SWB_EMPTY) ||
- (swb[i]->swb_valid & (1 << off[i])) == 0 ||
- (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
- ((reqaddr[i] / dmmax) != reqdskregion)) {
- failed = 1;
- vm_page_free(m[i]);
- if (first == 0)
- first = i + 1;
- }
+
+ if (blk & SWAPBLK_NONE) {
+ i = reqpage;
+ j = reqpage + 1;
}
+
/*
- * search forwards for the last contiguous page to transfer
+ * free pages outside our collection range. Note: we never free
+ * mreq, it must remain busy throughout.
*/
- failed = 0;
- last = count;
- for (i = reqpage + 1; i < count; i++) {
- if (failed || (reqaddr[i] == SWB_EMPTY) ||
- (swb[i]->swb_valid & (1 << off[i])) == 0 ||
- (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
- ((reqaddr[i] / dmmax) != reqdskregion)) {
- failed = 1;
- vm_page_free(m[i]);
- if (last == count)
- last = i;
- }
- }
- count = last;
- if (first != 0) {
- for (i = first; i < count; i++) {
- m[i - first] = m[i];
- reqaddr[i - first] = reqaddr[i];
- off[i - first] = off[i];
+ {
+ int k;
+
+ for (k = 0; k < i; ++k) {
+ vm_page_free(m[k]);
+ }
+ for (k = j; k < count; ++k) {
+ vm_page_free(m[k]);
}
- count -= first;
- reqpage -= first;
}
- ++swb[reqpage]->swb_locked;
/*
- * at this point: "m" is a pointer to the array of vm_page_t for
- * paging I/O "count" is the number of vm_page_t entries represented
- * by "m" "object" is the vm_object_t for I/O "reqpage" is the index
- * into "m" for the page actually faulted
+ * Return VM_PAGER_FAIL if we have nothing
+ * to do. Return mreq still busy, but the
+ * others unbusied.
*/
+ if (blk & SWAPBLK_NONE)
+ return(VM_PAGER_FAIL);
+
+
/*
* Get a swap buffer header to perform the IO
*/
- bp = getpbuf();
+
+ bp = getpbuf(&nsw_rcount);
kva = (vm_offset_t) bp->b_data;
/*
* map our page(s) into kva for input
+ *
+ * NOTE: B_PAGING is set by pbgetvp()
*/
- pmap_qenter(kva, m, count);
- bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING;
- bp->b_iodone = swap_pager_iodone1;
+ pmap_qenter(kva, m + i, j - i);
+
+ bp->b_flags = B_BUSY | B_READ | B_CALL;
+ bp->b_iodone = swp_pager_async_iodone;
bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */
bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
crhold(bp->b_rcred);
crhold(bp->b_wcred);
bp->b_data = (caddr_t) kva;
- bp->b_blkno = reqaddr[0];
- bp->b_bcount = PAGE_SIZE * count;
- bp->b_bufsize = PAGE_SIZE * count;
+ /*
+ * b_blkno is in page-sized chunks. swapblk is valid, too, so
+ * we don't have to mask it against SWAPBLK_MASK.
+ */
+ bp->b_blkno = blk - (reqpage - i);
+ bp->b_bcount = PAGE_SIZE * (j - i);
+ bp->b_bufsize = PAGE_SIZE * (j - i);
+ bp->b_pager.pg_reqpage = reqpage - i;
+
+ {
+ int k;
+
+ for (k = i; k < j; ++k) {
+ bp->b_pages[k - i] = m[k];
+ vm_page_flag_set(m[k], PG_SWAPINPROG);
+ }
+ }
+ bp->b_npages = j - i;
pbgetvp(swapdev_vp, bp);
cnt.v_swapin++;
- cnt.v_swappgsin += count;
+ cnt.v_swappgsin += bp->b_npages;
+
+ /*
+ * We still hold the lock on mreq, and our automatic completion routine
+ * does not remove it.
+ */
+
+ vm_object_pip_add(mreq->object, bp->b_npages);
+ lastpindex = m[j-1]->pindex;
+
/*
- * perform the I/O
+ * perform the I/O. NOTE!!! bp cannot be considered valid after
+ * this point because we automatically release it on completion.
+ * Instead, we look at the one page we are interested in which we
+ * still hold a lock on even through the I/O completion.
+ *
+ * The other pages in our m[] array are also released on completion,
+ * so we cannot assume they are valid anymore either.
+ *
+ * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
*/
+
VOP_STRATEGY(bp->b_vp, bp);
/*
- * wait for the sync I/O to complete
+ * wait for the page we want to complete. PG_SWAPINPROG is always
+ * cleared on completion. If an I/O error occurs, SWAPBLK_NONE
+ * is set in the meta-data.
*/
+
s = splvm();
- while ((bp->b_flags & B_DONE) == 0) {
- if (tsleep(bp, PVM, "swread", hz*20)) {
+
+ while ((mreq->flags & PG_SWAPINPROG) != 0) {
+ vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
+ cnt.v_intrans++;
+ if (tsleep(mreq, PSWP, "swread", hz*20)) {
printf(
-"swap_pager: indefinite wait buffer: device: %#lx, blkno: %ld, size: %ld\n",
+ "swap_pager: indefinite wait buffer: device:"
+ " %#lx, blkno: %ld, size: %ld\n",
(u_long)bp->b_dev, (long)bp->b_blkno,
- (long)bp->b_bcount);
+ (long)bp->b_bcount
+ );
}
}
- if (bp->b_flags & B_ERROR) {
- printf(
-"swap_pager: I/O error - pagein failed; blkno %ld, size %ld, error %d\n",
- (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
- rv = VM_PAGER_ERROR;
- } else {
- rv = VM_PAGER_OK;
- }
-
splx(s);
- swb[reqpage]->swb_locked--;
-
- /*
- * remove the mapping for kernel virtual
- */
- pmap_qremove(kva, count);
/*
- * release the physical I/O buffer
- */
- relpbuf(bp);
- /*
- * finish up input if everything is ok
+ * mreq is left busied after completion, but all the other pages
+ * are freed. If we had an unrecoverable read error the page will
+ * not be valid.
*/
- if (rv == VM_PAGER_OK) {
- for (i = 0; i < count; i++) {
- m[i]->dirty = 0;
- vm_page_flag_clear(m[i], PG_ZERO);
- if (i != reqpage) {
- /*
- * whether or not to leave the page
- * activated is up in the air, but we
- * should put the page on a page queue
- * somewhere. (it already is in the
- * object). After some emperical
- * results, it is best to deactivate
- * the readahead pages.
- */
- vm_page_deactivate(m[i]);
- /*
- * just in case someone was asking for
- * this page we now tell them that it
- * is ok to use
- */
- m[i]->valid = VM_PAGE_BITS_ALL;
- vm_page_wakeup(m[i]);
- }
- }
-
- m[reqpage]->object->last_read = m[count-1]->pindex;
+ if (mreq->valid != VM_PAGE_BITS_ALL) {
+ return(VM_PAGER_ERROR);
} else {
- swap_pager_ridpages(m, count, reqpage);
+ mreq->object->last_read = lastpindex;
+ return(VM_PAGER_OK);
}
- return (rv);
+
+ /*
+ * A final note: in a low swap situation, we cannot deallocate swap
+ * and mark a page dirty here because the caller is likely to mark
+ * the page clean when we return, causing the page to possibly revert
+ * to all-zeros later.
+ */
}
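
The clustering loops in swap_pager_getpages() above accept a neighboring page only if its swap block is consecutive with the request page's block and lies in the same device stripe; the stripe test is the XOR-and-mask expression (blk ^ iblk) & dmmax_mask. A small sketch of just that test, assuming dmmax is a power of two and dmmax_mask is ~(dmmax - 1); the stripe size chosen here is illustrative only:

#include <stdio.h>

#define DMMAX		64			/* assumed stripe size, in blocks */
#define DMMAX_MASK	(~(DMMAX - 1))

static int
same_stripe(long a, long b)
{
	/* nonzero XOR bits above the stripe offset => different stripes */
	return (((a ^ b) & DMMAX_MASK) == 0);
}

int
main(void)
{
	printf("%d\n", same_stripe(10, 63));	/* 1: both in stripe 0      */
	printf("%d\n", same_stripe(63, 64));	/* 0: adjacent but split    */
	return (0);
}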
+/*
+ * swap_pager_putpages:
+ *
+ * Assign swap (if necessary) and initiate I/O on the specified pages.
+ *
+ * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
+ * are automatically converted to SWAP objects.
+ *
+ * In a low memory situation we may block in VOP_STRATEGY(), but the new
+ * vm_page reservation system coupled with properly written VFS devices
+ * should ensure that no low-memory deadlock occurs. This is an area
+ * which needs work.
+ *
+ * The parent has N vm_object_pip_add() references prior to
+ * calling us and will remove references for rtvals[] that are
+ * not set to VM_PAGER_PEND. We need to remove the rest on I/O
+ * completion.
+ *
+ * The parent has soft-busy'd the pages it passes us and will unbusy
+ * those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
+ * We need to unbusy the rest on I/O completion.
+ */
+
int
swap_pager_putpages(object, m, count, sync, rtvals)
vm_object_t object;
@@ -1136,534 +982,749 @@ swap_pager_putpages(object, m, count, sync, rtvals)
boolean_t sync;
int *rtvals;
{
- register struct buf *bp;
- sw_blk_t swb[count];
- register int s;
- int i, j, ix, firstidx, lastidx;
- boolean_t rv;
- vm_offset_t kva, off, fidx;
- swp_clean_t spc;
- vm_pindex_t paging_pindex;
- int reqaddr[count];
- int failed;
-
- if (vm_swap_size)
- no_swap_space = 0;
-
- if (no_swap_space) {
- for (i = 0; i < count; i++)
- rtvals[i] = VM_PAGER_FAIL;
- return VM_PAGER_FAIL;
+ int i;
+ int n = 0;
+ int grv = VM_PAGER_OK;
+
+#if !defined(MAX_PERF)
+ if (count && m[0]->object != object) {
+ panic("swap_pager_getpages: object mismatch %p/%p",
+ object,
+ m[0]->object
+ );
+ }
+#endif
+ /*
+ * Step 1
+ *
+ * Turn object into OBJT_SWAP
+ * check for bogus sysops
+ * force sync if not pageout process
+ */
+
+ if (object->type != OBJT_SWAP) {
+ swp_pager_meta_build(object, 0, SWAPBLK_NONE, 0);
}
if (curproc != pageproc)
sync = TRUE;
- object = m[0]->object;
- paging_pindex = OFF_TO_IDX(object->paging_offset);
-
- failed = 0;
- for (j = 0; j < count; j++) {
- fidx = m[j]->pindex + paging_pindex;
- ix = swap_pager_block_index(fidx);
- swb[j] = 0;
- if (ix >= object->un_pager.swp.swp_nblocks) {
- rtvals[j] = VM_PAGER_FAIL;
- failed = 1;
- continue;
- } else {
- rtvals[j] = VM_PAGER_OK;
- }
- swb[j] = &object->un_pager.swp.swp_blocks[ix];
- swb[j]->swb_locked++;
- if (failed) {
- rtvals[j] = VM_PAGER_FAIL;
- continue;
- }
- off = swap_pager_block_offset(fidx);
- reqaddr[j] = swb[j]->swb_block[off];
- if (reqaddr[j] == SWB_EMPTY) {
- daddr_t blk;
- int tries;
- int ntoget;
+ /*
+ * Step 2
+ *
+ * Assign swap blocks and issue I/O. We reallocate swap on the fly.
+ * The page is left dirty until the pageout operation completes
+ * successfully.
+ */
- tries = 0;
- s = splvm();
+ for (i = 0; i < count; i += n) {
+ int s;
+ int j;
+ struct buf *bp;
+ daddr_t blk;
- /*
- * if any other pages have been allocated in this
- * block, we only try to get one page.
- */
- for (i = 0; i < SWB_NPAGES; i++) {
- if (swb[j]->swb_block[i] != SWB_EMPTY)
- break;
- }
+ /*
+ * Maximum I/O size is limited by a number of factors.
+ */
- ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
- /*
- * this code is alittle conservative, but works (the
- * intent of this code is to allocate small chunks for
- * small objects)
- */
- if ((off == 0) && ((fidx + ntoget) > object->size)) {
- ntoget = object->size - fidx;
- }
- retrygetspace:
- if (!swap_pager_full && ntoget > 1 &&
- swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE),
- &blk)) {
-
- for (i = 0; i < ntoget; i++) {
- swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
- swb[j]->swb_valid = 0;
- }
+ n = min(BLIST_MAX_ALLOC, count - i);
+ n = min(n, max_pageout_cluster);
- reqaddr[j] = swb[j]->swb_block[off];
- } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE),
- &swb[j]->swb_block[off])) {
- /*
- * if the allocation has failed, we try to
- * reclaim space and retry.
- */
- if (++tries == 1) {
- swap_pager_reclaim();
- goto retrygetspace;
- }
- rtvals[j] = VM_PAGER_AGAIN;
- failed = 1;
- swap_pager_full = 1;
- } else {
- reqaddr[j] = swb[j]->swb_block[off];
- swb[j]->swb_valid &= ~(1 << off);
+ /*
+ * Get biggest block of swap we can. If we fail, fall
+ * back and try to allocate a smaller block. Don't go
+ * overboard trying to allocate space if it would overly
+ * fragment swap.
+ */
+ while (
+ (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
+ n > 4
+ ) {
+ n >>= 1;
+ }
+ if (blk == SWAPBLK_NONE) {
+ for (j = 0; j < n; ++j) {
+ rtvals[i+j] = VM_PAGER_FAIL;
}
- splx(s);
+ grv = VM_PAGER_FAIL;
+ continue;
}
- }
- /*
- * search forwards for the last contiguous page to transfer
- */
- failed = 0;
- for (i = 0; i < count; i++) {
- if (failed ||
- (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) ||
- ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) ||
- (rtvals[i] != VM_PAGER_OK)) {
- failed = 1;
- if (rtvals[i] == VM_PAGER_OK)
- rtvals[i] = VM_PAGER_AGAIN;
+ /*
+ * Oops, too big if it crosses a stripe
+ *
+ * 1111000000
+ * 111111
+ * 1000001
+ */
+ if ((blk ^ (blk + n)) & dmmax_mask) {
+ j = ((blk + dmmax) & dmmax_mask) - blk;
+ swp_pager_freeswapspace(blk + j, n - j);
+ n = j;
}
- }
- ix = 0;
- firstidx = -1;
- for (i = 0; i < count; i++) {
- if (rtvals[i] == VM_PAGER_OK) {
- ix++;
- if (firstidx == -1) {
- firstidx = i;
- }
- } else if (firstidx >= 0) {
- break;
- }
- }
+ /*
+ * All I/O parameters have been satisfied, build the I/O
+ * request and assign the swap space.
+ *
+ * NOTE: B_PAGING is set by pbgetvp()
+ */
- if (firstidx == -1) {
- for (i = 0; i < count; i++) {
- if (rtvals[i] == VM_PAGER_OK)
- rtvals[i] = VM_PAGER_AGAIN;
- }
- return VM_PAGER_AGAIN;
- }
+ bp = getpbuf(&nsw_wcount);
+ bp->b_spc = NULL; /* not used, but NULL-out anyway */
- lastidx = firstidx + ix;
+ pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
- if (ix > max_pageout_cluster) {
- for (i = firstidx + max_pageout_cluster; i < lastidx; i++) {
- if (rtvals[i] == VM_PAGER_OK)
- rtvals[i] = VM_PAGER_AGAIN;
- }
- ix = max_pageout_cluster;
- lastidx = firstidx + ix;
- }
+ bp->b_flags = B_BUSY | B_ASYNC;
+ bp->b_proc = &proc0; /* XXX (but without B_PHYS this is ok) */
+ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
- for (i = 0; i < firstidx; i++) {
- if (swb[i])
- swb[i]->swb_locked--;
- }
+ if (bp->b_rcred != NOCRED)
+ crhold(bp->b_rcred);
+ if (bp->b_wcred != NOCRED)
+ crhold(bp->b_wcred);
+ pbgetvp(swapdev_vp, bp);
- for (i = lastidx; i < count; i++) {
- if (swb[i])
- swb[i]->swb_locked--;
- }
+ bp->b_bcount = PAGE_SIZE * n;
+ bp->b_bufsize = PAGE_SIZE * n;
+ bp->b_blkno = blk;
-#ifdef INVARIANTS
- for (i = firstidx; i < lastidx; i++) {
- if (reqaddr[i] == SWB_EMPTY) {
- printf("I/O to empty block???? -- pindex: %d, i: %d\n",
- m[i]->pindex, i);
- }
- }
-#endif
+ s = splvm();
- /*
- * Clean up all completed async pageouts.
- */
- if (swap_pager_free_pending)
- swap_pager_sync();
+ for (j = 0; j < n; ++j) {
+ vm_page_t mreq = m[i+j];
- /*
- * get a swap pager clean data structure, block until we get it
- */
- if (curproc == pageproc) {
- if (swap_pager_free_count == 0) {
- s = splvm();
- while (swap_pager_free_count == 0) {
- swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT;
- /*
- * if it does not get one within a short time, then
- * there is a potential deadlock, so we go-on trying
- * to free pages. It is important to block here as opposed
- * to returning, thereby allowing the pageout daemon to continue.
- * It is likely that pageout daemon will start suboptimally
- * reclaiming vnode backed pages if we don't block. Since the
- * I/O subsystem is probably already fully utilized, might as
- * well wait.
- */
- if (tsleep(&swap_pager_needflags, PVM-1, "swpfre", hz/2)) {
- if (swap_pager_free_pending)
- swap_pager_sync();
- if (swap_pager_free_count == 0) {
- for (i = firstidx; i < lastidx; i++) {
- rtvals[i] = VM_PAGER_AGAIN;
- }
- splx(s);
- return VM_PAGER_AGAIN;
- }
- } else {
- swap_pager_sync();
- }
- }
- splx(s);
+ swp_pager_meta_build(
+ mreq->object,
+ mreq->pindex,
+ blk + j,
+ 0
+ );
+ mreq->dirty = VM_PAGE_BITS_ALL;
+ rtvals[i+j] = VM_PAGER_OK;
+
+ vm_page_flag_set(mreq, PG_SWAPINPROG);
+ bp->b_pages[j] = mreq;
}
+ bp->b_flags |= B_CALL;
+ bp->b_npages = n;
- spc = TAILQ_FIRST(&swap_pager_free);
- KASSERT(spc != NULL,
- ("swap_pager_putpages: free queue is empty, %d expected\n",
- swap_pager_free_count));
- TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
- swap_pager_free_count--;
-
- kva = spc->spc_kva;
- bp = spc->spc_bp;
- bzero(bp, sizeof *bp);
- bp->b_spc = spc;
- bp->b_xflags = 0;
- bp->b_data = (caddr_t) kva;
- } else {
- spc = NULL;
- bp = getpbuf();
- kva = (vm_offset_t) bp->b_data;
- bp->b_spc = NULL;
- }
+ cnt.v_swapout++;
+ cnt.v_swappgsout += bp->b_npages;
+ swapdev_vp->v_numoutput++;
- /*
- * map our page(s) into kva for I/O
- */
- pmap_qenter(kva, &m[firstidx], ix);
+ /*
+ * asynchronous
+ *
+ * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
+ */
+
+ if (sync == FALSE) {
+ bp->b_iodone = swp_pager_async_iodone;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bcount;
+ VOP_STRATEGY(bp->b_vp, bp);
+
+ for (j = 0; j < n; ++j)
+ rtvals[i+j] = VM_PAGER_PEND;
+
+ splx(s);
+ grv = VM_PAGER_PEND;
+ continue;
+ }
- /*
- * get the base I/O offset into the swap file
- */
- for (i = firstidx; i < lastidx ; i++) {
- fidx = m[i]->pindex + paging_pindex;
- off = swap_pager_block_offset(fidx);
/*
- * set the valid bit
+ * synchronous
+ *
+ * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
*/
- swb[i]->swb_valid |= (1 << off);
+
+ bp->b_iodone = swp_pager_sync_iodone;
+ VOP_STRATEGY(bp->b_vp, bp);
+
/*
- * and unlock the data structure
+ * Wait for the sync I/O to complete, then update rtvals.
+ * We just set the rtvals[] to VM_PAGER_PEND so we can call
+ * our async completion routine at the end, thus avoiding a
+ * double-free.
*/
- swb[i]->swb_locked--;
- }
+ while ((bp->b_flags & B_DONE) == 0) {
+ tsleep(bp, PVM, "swwrt", 0);
+ }
- bp->b_flags = B_BUSY | B_PAGING;
- bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */
- bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
- if (bp->b_rcred != NOCRED)
- crhold(bp->b_rcred);
- if (bp->b_wcred != NOCRED)
- crhold(bp->b_wcred);
- bp->b_blkno = reqaddr[firstidx];
- pbgetvp(swapdev_vp, bp);
+ if (bp->b_flags & B_ERROR) {
+ grv = VM_PAGER_ERROR;
+ }
- bp->b_bcount = PAGE_SIZE * ix;
- bp->b_bufsize = PAGE_SIZE * ix;
+ for (j = 0; j < n; ++j)
+ rtvals[i+j] = VM_PAGER_PEND;
- s = splvm();
- swapdev_vp->v_numoutput++;
+ if (bp->b_flags & B_ERROR) {
+ grv = VM_PAGER_ERROR;
+ }
- /*
- * If this is an async write we set up additional buffer fields and
- * place a "cleaning" entry on the inuse queue.
- */
- object->un_pager.swp.swp_poip++;
-
- if (spc) {
- spc->spc_flags = 0;
- spc->spc_object = object;
- bp->b_npages = ix;
- for (i = firstidx; i < lastidx; i++) {
- spc->spc_m[i] = m[i];
- bp->b_pages[i - firstidx] = m[i];
- vm_page_protect(m[i], VM_PROT_READ);
- pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
- m[i]->dirty = 0;
- }
- spc->spc_first = firstidx;
- spc->spc_count = ix;
/*
- * the completion routine for async writes
+ * Now that we are through with the bp, we can call the
+ * normal async completion, which frees everything up.
*/
- bp->b_flags |= B_CALL;
- bp->b_iodone = swap_pager_iodone;
- bp->b_dirtyoff = 0;
- bp->b_dirtyend = bp->b_bcount;
- TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
- } else {
- bp->b_flags |= B_CALL;
- bp->b_iodone = swap_pager_iodone1;
- bp->b_npages = ix;
- for (i = firstidx; i < lastidx; i++)
- bp->b_pages[i - firstidx] = m[i];
- }
- cnt.v_swapout++;
- cnt.v_swappgsout += ix;
+ swp_pager_async_iodone(bp);
- /*
- * perform the I/O
- */
- VOP_STRATEGY(bp->b_vp, bp);
- if (sync == FALSE) {
- if (swap_pager_free_pending) {
- swap_pager_sync();
- }
- for (i = firstidx; i < lastidx; i++) {
- rtvals[i] = VM_PAGER_PEND;
- }
splx(s);
- return VM_PAGER_PEND;
}
+ return(grv);
+}
+
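
swap_pager_putpages() above clips each swap allocation so a single I/O never crosses a dmmax stripe: if blk and blk + n fall in different stripes, n is cut back to the distance from blk to the next stripe boundary and the excess blocks are returned to the free list. A userland sketch of the clipping arithmetic alone; the dmmax value is an assumption and the freeing of the excess is omitted:

#include <stdio.h>

#define DMMAX		64
#define DMMAX_MASK	(~(DMMAX - 1))

/* return how many of n blocks starting at blk stay within blk's stripe */
static int
clip_to_stripe(long blk, int n)
{
	if ((blk ^ (blk + n)) & DMMAX_MASK)
		n = (int)(((blk + DMMAX) & DMMAX_MASK) - blk);
	return (n);
}

int
main(void)
{
	printf("%d\n", clip_to_stripe(60, 16));	/* 4: only blocks 60..63 kept */
	printf("%d\n", clip_to_stripe(0, 16));	/* 16: fits inside stripe 0   */
	return (0);
}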
+/*
+ * swp_pager_sync_iodone:
+ *
+ * Completion routine for synchronous reads and writes from/to swap.
+ *	We just mark the bp as complete and wake up anyone waiting on it.
+ *
+ * This routine may not block.
+ */
+
+static void
+swp_pager_sync_iodone(bp)
+ struct buf *bp;
+{
+ bp->b_flags |= B_DONE;
+ bp->b_flags &= ~B_ASYNC;
+ wakeup(bp);
+}
+
+/*
+ * swp_pager_async_iodone:
+ *
+ * Completion routine for asynchronous reads and writes from/to swap.
+ * Also called manually by synchronous code to finish up a bp.
+ *
+ * WARNING! This routine may be called from an interrupt. We cannot
+ * mess with swap metadata unless we want to run all our other routines
+ * at splbio() too, which I'd rather not do. We up ourselves
+ * to splvm() because we may call vm_page_free(), which can unlink a
+ * page from an object.
+ *
+ * XXX currently I do not believe any object routines protect
+ * object->memq at splvm(). The code must be gone over to determine
+ * the actual state of the problem.
+ *
+ * For READ operations, the pages are PG_BUSY'd. For WRITE operations,
+ * the pages are vm_page_t->busy'd.  For READ operations, we unbusy
+ * (PG_BUSY) all pages except the 'main' request page.  For WRITE
+ * operations, we unbusy (vm_page_t->busy) all pages ( we can do this
+ * because we marked them all VM_PAGER_PEND on return from putpages ).
+ *
+ * This routine may not block.
+ * This routine is called at splbio()
+ */
+
+static void
+swp_pager_async_iodone(bp)
+ register struct buf *bp;
+{
+ int s;
+ int i;
+ vm_object_t object = NULL;
+
+ s = splvm();
+
+ bp->b_flags |= B_DONE;
+
/*
- * wait for the sync I/O to complete
+ * report error
*/
- while ((bp->b_flags & B_DONE) == 0) {
- tsleep(bp, PVM, "swwrt", 0);
- }
if (bp->b_flags & B_ERROR) {
printf(
-"swap_pager: I/O error - pageout failed; blkno %ld, size %ld, error %d\n",
- (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
- rv = VM_PAGER_ERROR;
- } else {
- rv = VM_PAGER_OK;
+ "swap_pager: I/O error - %s failed; blkno %ld,"
+ "size %ld, error %d\n",
+ ((bp->b_flags & B_READ) ? "pagein" : "pageout"),
+ (long)bp->b_blkno,
+ (long)bp->b_bcount,
+ bp->b_error
+ );
}
- object->un_pager.swp.swp_poip--;
- if (object->un_pager.swp.swp_poip == 0)
- wakeup(object);
-
- if (bp->b_vp)
- pbrelvp(bp);
+ /*
+ * set object.
+ */
- splx(s);
+ if (bp->b_npages)
+ object = bp->b_pages[0]->object;
/*
* remove the mapping for kernel virtual
*/
- pmap_qremove(kva, ix);
+
+ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
/*
- * if we have written the page, then indicate that the page is clean.
+ * cleanup pages. If an error occurs writing to swap, we are in
+ * very serious trouble. If it happens to be a disk error, though,
+ * we may be able to recover by reassigning the swap later on. So
+ * in this case we remove the m->swapblk assignment for the page
+ * but do not free it in the rlist.  The erroneous block(s) are thus
+ * never reallocated as swap. Redirty the page and continue.
*/
- if (rv == VM_PAGER_OK) {
- for (i = firstidx; i < lastidx; i++) {
- if (rtvals[i] == VM_PAGER_OK) {
- pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
- m[i]->dirty = 0;
+
+ for (i = 0; i < bp->b_npages; ++i) {
+ vm_page_t m = bp->b_pages[i];
+
+ vm_page_flag_clear(m, PG_SWAPINPROG);
+
+ if (bp->b_flags & B_ERROR) {
+ /*
+ * If an error occurs I'd love to throw the swapblk
+ * away without freeing it back to swapspace, so it
+ * can never be used again. But I can't from an
+ * interrupt.
+ */
+
+ if (bp->b_flags & B_READ) {
/*
- * optimization, if a page has been read
- * during the pageout process, we activate it.
+ * When reading, reqpage needs to stay
+ * locked for the parent, but all other
+ * pages can be freed. We still want to
+ * wakeup the parent waiting on the page,
+ * though. ( also: pg_reqpage can be -1 and
+ * not match anything ).
+ *
+ * We have to wake specifically requested pages
+ * up too because we cleared PG_SWAPINPROG and
+ * someone may be waiting for that.
+ *
+ * NOTE: for reads, m->dirty will probably
+ * be overridden by the original caller of
+ * getpages so don't play cute tricks here.
+ *
+ * XXX it may not be legal to free the page
+ * here as this messes with the object->memq's.
*/
- if (((m[i]->flags & (PG_WANTED|PG_REFERENCED)) ||
- pmap_ts_referenced(VM_PAGE_TO_PHYS(m[i])))) {
- vm_page_activate(m[i]);
- }
+
+ m->valid = 0;
+ vm_page_flag_clear(m, PG_ZERO);
+
+ if (i != bp->b_pager.pg_reqpage)
+ vm_page_free(m);
+ else
+ vm_page_flash(m);
+ /*
+ * If i == bp->b_pager.pg_reqpage, do not wake
+ * the page up. The caller needs to.
+ */
+ } else {
+ /*
+ * If a write error occurs, reactivate page
+ * so it doesn't clog the inactive list,
+ * then finish the I/O.
+ */
+ m->dirty = VM_PAGE_BITS_ALL;
+ vm_page_activate(m);
+ vm_page_io_finish(m);
}
- }
- } else {
- for (i = firstidx; i < lastidx; i++) {
- rtvals[i] = rv;
+ } else if (bp->b_flags & B_READ) {
+ /*
+ * For read success, clear dirty bits. Nobody should
+ * have this page mapped but don't take any chances,
+ * make sure the pmap modify bits are also cleared.
+ *
+ * NOTE: for reads, m->dirty will probably be
+ * overridden by the original caller of getpages so
+ * we cannot set them in order to free the underlying
+ * swap in a low-swap situation. I don't think we'd
+ * want to do that anyway, but it was an optimization
+ * that existed in the old swapper for a time before
+ * it got ripped out due to precisely this problem.
+ *
+ * clear PG_ZERO in page.
+ *
+ * If not the requested page then deactivate it.
+ *
+ * Note that the requested page, reqpage, is left
+ * busied, but we still have to wake it up. The
+ * other pages are released (unbusied) by
+ * vm_page_wakeup(). We do not set reqpage's
+ * valid bits here; that is up to the caller.
+ */
+
+ pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+ m->valid = VM_PAGE_BITS_ALL;
+ m->dirty = 0;
+ vm_page_flag_clear(m, PG_ZERO);
+
+ /*
+ * We have to wake specifically requested pages
+ * up too because we cleared PG_SWAPINPROG and
+ * the caller could be waiting for it in getpages.  However,
+ * be sure not to unbusy the page getpages specifically
+ * requested - getpages expects it to be
+ * left busy.
+ */
+ if (i != bp->b_pager.pg_reqpage) {
+ vm_page_deactivate(m);
+ vm_page_wakeup(m);
+ } else {
+ vm_page_flash(m);
+ }
+ } else {
+ /*
+ * For write success, clear the modify and dirty
+ * status, then finish the I/O ( which decrements the
+ * busy count and possibly wakes waiters up ).
+ */
+ vm_page_protect(m, VM_PROT_READ);
+ pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+ m->dirty = 0;
+ vm_page_io_finish(m);
}
}
- if (spc != NULL) {
- if (bp->b_rcred != NOCRED)
- crfree(bp->b_rcred);
- if (bp->b_wcred != NOCRED)
- crfree(bp->b_wcred);
- spc_free(spc);
- } else
- relpbuf(bp);
- if (swap_pager_free_pending)
- swap_pager_sync();
-
- return (rv);
+ /*
+ * adjust pip. NOTE: the original parent may still have its own
+ * pip refs on the object.
+ */
+
+ if (object)
+ vm_object_pip_wakeupn(object, bp->b_npages);
+
+ /*
+ * release the physical I/O buffer
+ */
+
+ relpbuf(bp, ((bp->b_flags & B_READ) ? &nsw_rcount : &nsw_wcount));
+
+ splx(s);
}
-void
-swap_pager_sync()
+/************************************************************************
+ * SWAP META DATA *
+ ************************************************************************
+ *
+ * These routines manipulate the swap metadata stored in the
+ * OBJT_SWAP object.
+ *
+ * In fact, we just have a few counters in the vm_object_t. The
+ * metadata is actually stored in a hash table.
+ */
+
+/*
+ * SWP_PAGER_HASH() - hash swap meta data
+ *
+ *	This is an inline helper function which hashes the swapblk given
+ * the object and page index. It returns a pointer to a pointer
+ * to the object, or a pointer to a NULL pointer if it could not
+ * find a swapblk.
+ */
+
+static __inline struct swblock **
+swp_pager_hash(vm_object_t object, daddr_t index)
{
- swp_clean_t spc;
+ struct swblock **pswap;
+ struct swblock *swap;
+
+ index &= ~SWAP_META_MASK;
+ pswap = &swhash[(index ^ (int)(long)object) & swhash_mask];
- while (spc = TAILQ_FIRST(&swap_pager_done)) {
- swap_pager_finish(spc);
+ while ((swap = *pswap) != NULL) {
+ if (swap->swb_object == object &&
+ swap->swb_index == index
+ ) {
+ break;
+ }
+ pswap = &swap->swb_hnext;
}
- return;
+ return(pswap);
}
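
swp_pager_hash() above rounds the page index down to a SWAP_META_PAGES-aligned bucket, mixes it with the object pointer to select a hash chain, then walks the swb_hnext chain for an exact (object, index) match, returning the link slot so callers can insert or unlink in place. A compilable userland sketch of that lookup; the bucket count and the use of an ordinary pointer as the object identity are assumptions:

#include <stdio.h>

#define SWAP_META_PAGES	16
#define SWAP_META_MASK	(SWAP_META_PAGES - 1)
#define NBUCKETS	64			/* assumed hash size (power of 2) */

struct swblock {
	struct swblock	*swb_hnext;
	void		*swb_object;
	long		 swb_index;		/* first page index of this bucket */
	long		 swb_pages[SWAP_META_PAGES];
};

static struct swblock *swhash[NBUCKETS];

/* return the chain slot for (object, index); *slot == NULL if absent */
static struct swblock **
swp_hash(void *object, long index)
{
	struct swblock **pswap, *swap;

	index &= ~SWAP_META_MASK;		/* round down to bucket base */
	pswap = &swhash[(index ^ (long)object) & (NBUCKETS - 1)];
	while ((swap = *pswap) != NULL) {
		if (swap->swb_object == object && swap->swb_index == index)
			break;
		pswap = &swap->swb_hnext;
	}
	return (pswap);
}

int
main(void)
{
	int obj;				/* stand-in for a vm_object_t */
	struct swblock **pswap = swp_hash(&obj, 37);

	printf("page 37 -> bucket base %ld, %s\n",
	    37L & ~(long)SWAP_META_MASK,
	    *pswap ? "found" : "empty");	/* bucket base 32, empty */
	return (0);
}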
+/*
+ * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
+ *
+ * We first convert the object to a swap object if it is a default
+ * object.
+ *
+ * The specified swapblk is added to the object's swap metadata. If
+ * the swapblk is not valid, it is freed instead. Any previously
+ * assigned swapblk is freed.
+ */
+
static void
-swap_pager_finish(spc)
- register swp_clean_t spc;
-{
- int i, s, lastidx;
- vm_object_t object;
- vm_page_t *ma;
+swp_pager_meta_build(
+ vm_object_t object,
+ daddr_t index,
+ daddr_t swapblk,
+ int waitok
+) {
+ struct swblock *swap;
+ struct swblock **pswap;
- ma = spc->spc_m;
- object = spc->spc_object;
- lastidx = spc->spc_first + spc->spc_count;
+ /*
+ * Convert default object to swap object if necessary
+ */
- s = splvm();
- TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
- splx(s);
+ if (object->type != OBJT_SWAP) {
+ object->type = OBJT_SWAP;
+ object->un_pager.swp.swp_bcount = 0;
+
+ if (object->handle != NULL) {
+ TAILQ_INSERT_TAIL(
+ NOBJLIST(object->handle),
+ object,
+ pager_object_list
+ );
+ } else {
+ TAILQ_INSERT_TAIL(
+ &swap_pager_un_object_list,
+ object,
+ pager_object_list
+ );
+ }
+ }
+
+ /*
+ * Wait for free memory when waitok is TRUE prior to calling the
+ * zone allocator.
+ */
- pmap_qremove(spc->spc_kva, spc->spc_count);
+ while (waitok && cnt.v_free_count == 0) {
+ VM_WAIT;
+ }
/*
- * If no error, mark as clean and inform the pmap system. If error,
- * mark as dirty so we will try again. (XXX could get stuck doing
- * this, should give up after awhile)
+ * If swapblk being added is invalid, just free it.
*/
- if (spc->spc_flags & SPC_ERROR) {
- for (i = spc->spc_first; i < lastidx; i++) {
- printf("swap_pager_finish: I/O error, clean of page %lx failed\n",
- (u_long) VM_PAGE_TO_PHYS(ma[i]));
- ma[i]->dirty = VM_PAGE_BITS_ALL;
- vm_page_io_finish(ma[i]);
+ if (swapblk & SWAPBLK_NONE) {
+ if (swapblk != SWAPBLK_NONE) {
+ swp_pager_freeswapspace(
+ index,
+ 1
+ );
+ swapblk = SWAPBLK_NONE;
}
+ }
- vm_object_pip_subtract(object, spc->spc_count);
- if ((object->paging_in_progress == 0) &&
- (object->flags & OBJ_PIPWNT)) {
- vm_object_clear_flag(object, OBJ_PIPWNT);
- wakeup(object);
- }
+ /*
+ * Locate hash entry. If not found create, but if we aren't adding
+ * anything just return.
+ */
- } else {
- for (i = spc->spc_first; i < lastidx; i++) {
- if ((ma[i]->queue != PQ_ACTIVE) &&
- ((ma[i]->flags & PG_WANTED) ||
- pmap_ts_referenced(VM_PAGE_TO_PHYS(ma[i])))) {
- vm_page_activate(ma[i]);
- }
- }
+ pswap = swp_pager_hash(object, index);
+
+ if ((swap = *pswap) == NULL) {
+ int i;
+
+ if (swapblk == SWAPBLK_NONE)
+ return;
+
+ swap = *pswap = zalloc(swap_zone);
+
+ swap->swb_hnext = NULL;
+ swap->swb_object = object;
+ swap->swb_index = index & ~SWAP_META_MASK;
+ swap->swb_count = 0;
+
+ ++object->un_pager.swp.swp_bcount;
+
+ for (i = 0; i < SWAP_META_PAGES; ++i)
+ swap->swb_pages[i] = SWAPBLK_NONE;
}
- nswiodone -= spc->spc_count;
- swap_pager_free_pending--;
- spc_free(spc);
+ /*
+ * Delete prior contents of metadata
+ */
- return;
+ index &= SWAP_META_MASK;
+
+ if (swap->swb_pages[index] != SWAPBLK_NONE) {
+ swp_pager_freeswapspace(
+ swap->swb_pages[index] & SWAPBLK_MASK,
+ 1
+ );
+ --swap->swb_count;
+ }
+
+ /*
+ * Enter block into metadata
+ */
+
+ swap->swb_pages[index] = swapblk;
+ ++swap->swb_count;
}
/*
- * swap_pager_iodone
+ * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
+ *
+ * The requested range of blocks is freed, with any associated swap
+ * returned to the swap bitmap.
+ *
+ * This routine will free swap metadata structures as they are cleaned
+ * out. This routine does *NOT* operate on swap metadata associated
+ * with resident pages.
+ *
+ * This routine must be called at splvm()
*/
+
static void
-swap_pager_iodone(bp)
- register struct buf *bp;
+swp_pager_meta_free(vm_object_t object, daddr_t index, daddr_t count)
{
- int i, s, lastidx;
- register swp_clean_t spc;
- vm_object_t object;
- vm_page_t *ma;
+ if (object->type != OBJT_SWAP)
+ return;
+ while (count > 0) {
+ struct swblock **pswap;
+ struct swblock *swap;
- s = splvm();
- spc = (swp_clean_t) bp->b_spc;
- TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
- TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
+ pswap = swp_pager_hash(object, index);
- object = spc->spc_object;
+ if ((swap = *pswap) != NULL) {
+ daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
-#if defined(DIAGNOSTIC)
- if (object->paging_in_progress < spc->spc_count)
- printf("swap_pager_iodone: paging_in_progress(%d) < spc_count(%d)\n",
- object->paging_in_progress, spc->spc_count);
-#endif
-
- if (bp->b_flags & B_ERROR) {
- spc->spc_flags |= SPC_ERROR;
- printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n",
- (bp->b_flags & B_READ) ? "pagein" : "pageout",
- (u_long) bp->b_blkno, bp->b_bcount, bp->b_error);
- } else {
- vm_object_pip_subtract(object, spc->spc_count);
- if ((object->paging_in_progress == 0) &&
- (object->flags & OBJ_PIPWNT)) {
- vm_object_clear_flag(object, OBJ_PIPWNT);
- wakeup(object);
- }
- ma = spc->spc_m;
- lastidx = spc->spc_first + spc->spc_count;
- for (i = spc->spc_first; i < lastidx; i++) {
- /*
- * we wakeup any processes that are waiting on these pages.
- */
- vm_page_io_finish(ma[i]);
+ if (v != SWAPBLK_NONE) {
+ swp_pager_freeswapspace(v, 1);
+ swap->swb_pages[index & SWAP_META_MASK] =
+ SWAPBLK_NONE;
+ if (--swap->swb_count == 0) {
+ *pswap = swap->swb_hnext;
+ zfree(swap_zone, swap);
+ --object->un_pager.swp.swp_bcount;
+ }
+ }
+ --count;
+ ++index;
+ } else {
+ daddr_t n = SWAP_META_PAGES - (index & SWAP_META_MASK);
+ count -= n;
+ index += n;
}
}
+}
+
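
When swp_pager_meta_free() above finds no swblock for the current index, it does not test the remaining pages of that bucket one by one; it jumps straight to the next SWAP_META_PAGES boundary. A tiny sketch of that skip arithmetic, with illustrative constants:

#include <stdio.h>

#define SWAP_META_PAGES	16
#define SWAP_META_MASK	(SWAP_META_PAGES - 1)

int
main(void)
{
	long index = 37, count = 100;

	/* pretend the bucket covering pages 32..47 is missing */
	long n = SWAP_META_PAGES - (index & SWAP_META_MASK);

	count -= n;
	index += n;
	printf("skipped %ld pages, next index %ld, %ld left\n",
	    n, index, count);		/* skipped 11, next 48, 89 left */
	return (0);
}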
+/*
+ * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
+ *
+ * This routine locates and destroys all swap metadata associated with
+ * an object.
+ */
+
+static void
+swp_pager_meta_free_all(vm_object_t object)
+{
+ daddr_t index = 0;
- if (bp->b_vp)
- pbrelvp(bp);
+ if (object->type != OBJT_SWAP)
+ return;
- if (bp->b_rcred != NOCRED)
- crfree(bp->b_rcred);
- if (bp->b_wcred != NOCRED)
- crfree(bp->b_wcred);
+ while (object->un_pager.swp.swp_bcount) {
+ struct swblock **pswap;
+ struct swblock *swap;
- nswiodone += spc->spc_count;
- swap_pager_free_pending++;
- if (--spc->spc_object->un_pager.swp.swp_poip == 0) {
- wakeup(spc->spc_object);
- }
+ pswap = swp_pager_hash(object, index);
+ if ((swap = *pswap) != NULL) {
+ int i;
- if (swap_pager_needflags &&
- ((swap_pager_free_count + swap_pager_free_pending) > (npendingio / 2))) {
- spc_wakeup();
+ for (i = 0; i < SWAP_META_PAGES; ++i) {
+ daddr_t v = swap->swb_pages[i];
+ if (v != SWAPBLK_NONE) {
+#if !defined(MAX_PERF)
+ --swap->swb_count;
+#endif
+ swp_pager_freeswapspace(
+ v,
+ 1
+ );
+ }
+ }
+#if !defined(MAX_PERF)
+ if (swap->swb_count != 0)
+ panic("swap_pager_meta_free_all: swb_count != 0");
+#endif
+ *pswap = swap->swb_hnext;
+ zfree(swap_zone, swap);
+ --object->un_pager.swp.swp_bcount;
+ }
+ index += SWAP_META_PAGES;
+#if !defined(MAX_PERF)
+ if (index > 0x20000000)
+ panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
+#endif
}
+}
- if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) &&
- vm_pageout_pages_needed) {
- wakeup(&vm_pageout_pages_needed);
- vm_pageout_pages_needed = 0;
+/*
+ * SWP_PAGER_META_CTL() - misc control of swap and vm_page_t meta data.
+ *
+ * This routine is capable of looking up, popping, or freeing
+ * swapblk assignments in the swap meta data or in the vm_page_t.
+ * The routine typically returns the swapblk being looked-up, or popped,
+ * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
+ * was invalid. This routine will automatically free any invalid
+ * meta-data swapblks.
+ *
+ * It is not possible to store invalid swapblks in the swap meta data
+ * (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
+ *
+ * When acting on a busy resident page and paging is in progress, we
+ * have to wait until paging is complete but otherwise can act on the
+ * busy page.
+ *
+ * SWM_FREE remove and free swap block from metadata
+ *
+ * SWM_POP remove from meta data but do not free.. pop it out
+ */
+
+static daddr_t
+swp_pager_meta_ctl(
+ vm_object_t object,
+ vm_pindex_t index,
+ int flags
+) {
+ /*
+ * The meta data only exists if the object is OBJT_SWAP
+ * and even then might not be allocated yet.
+ */
+
+ if (
+ object->type != OBJT_SWAP ||
+ object->un_pager.swp.swp_bcount == 0
+ ) {
+ return(SWAPBLK_NONE);
}
- splx(s);
+ {
+ struct swblock **pswap;
+ struct swblock *swap;
+ daddr_t r1 = SWAPBLK_NONE;
+
+ pswap = swp_pager_hash(object, index);
+
+ index &= SWAP_META_MASK;
+
+ if ((swap = *pswap) != NULL) {
+ r1 = swap->swb_pages[index];
+
+ if (r1 != SWAPBLK_NONE) {
+ if (flags & SWM_FREE) {
+ swp_pager_freeswapspace(
+ r1,
+ 1
+ );
+ r1 = SWAPBLK_NONE;
+ }
+ if (flags & (SWM_FREE|SWM_POP)) {
+ swap->swb_pages[index] = SWAPBLK_NONE;
+ if (--swap->swb_count == 0) {
+ *pswap = swap->swb_hnext;
+ zfree(swap_zone, swap);
+ --object->un_pager.swp.swp_bcount;
+ }
+ }
+ }
+ }
+
+ return(r1);
+ }
+ /* not reached */
}
+
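
The SWM_FREE/SWM_POP behaviour of swp_pager_meta_ctl() above reduces to: look the block up; with SWM_FREE return SWAPBLK_NONE and release the block; with either flag remove the metadata entry; with no flags leave everything in place. A toy model of that control flow, using a small array instead of the hash (the flag values, array, and helper name are assumptions):

#include <stdio.h>

#define SWAPBLK_NONE	((long)-1)
#define SWM_FREE	0x02
#define SWM_POP		0x04

static long meta[4] = { 100, SWAPBLK_NONE, 102, 103 };	/* toy metadata */

/* model of swp_pager_meta_ctl(): look up, optionally pop or free */
static long
meta_ctl(int index, int flags)
{
	long r1 = meta[index];

	if (r1 != SWAPBLK_NONE) {
		if (flags & SWM_FREE)
			r1 = SWAPBLK_NONE;	/* block goes back to the swap bitmap */
		if (flags & (SWM_FREE | SWM_POP))
			meta[index] = SWAPBLK_NONE;	/* entry removed either way */
	}
	return (r1);
}

int
main(void)
{
	printf("%ld\n", meta_ctl(2, 0));	/* 102: plain lookup, entry kept */
	printf("%ld\n", meta_ctl(3, SWM_POP));	/* 103: popped, entry removed    */
	printf("%ld\n", meta_ctl(0, SWM_FREE));	/* -1: freed, nothing returned   */
	return (0);
}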
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index ceb88b6..374223c 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90
- * $Id: swap_pager.h,v 1.21 1998/04/29 04:28:02 dyson Exp $
+ * $Id: swap_pager.h,v 1.22 1998/07/10 21:50:17 alex Exp $
*/
/*
@@ -59,26 +59,50 @@
#define SWB_NPAGES 8
#endif
+/*
+ * Piecemeal swap metadata structure. Swap is stored in a radix tree.
+ *
+ * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix
+ * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents
+ * 32K worth of data, two levels represent 256K, three levels represent
+ * 2 MBytes. This is acceptable.
+ *
+ * Overall memory utilization is about the same as the old swap structure.
+ */
+
+#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
+
+#define SWAP_META_PAGES (SWB_NPAGES * 2)
+#define SWAP_META_MASK (SWAP_META_PAGES - 1)
+
struct swblock {
- unsigned short swb_valid; /* bitmask for valid pages */
- unsigned short swb_locked; /* block locked */
- daddr_t swb_block[SWB_NPAGES];
+ struct swblock *swb_hnext;
+ vm_object_t swb_object;
+ int swb_index;
+ int swb_count;
+ daddr_t swb_pages[SWAP_META_PAGES];
};
-typedef struct swblock *sw_blk_t;
#ifdef KERNEL
extern struct pagerlst swap_pager_un_object_list;
extern int swap_pager_full;
-extern struct rlisthdr swaplist;
+extern struct blist *swapblist;
+
+int swap_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
+boolean_t swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, int *before, int *after));
-int swap_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
int swap_pager_swp_alloc __P((vm_object_t, int));
-void swap_pager_copy __P((vm_object_t, vm_pindex_t, vm_object_t,
- vm_pindex_t, vm_pindex_t, int));
+void swap_pager_copy __P((vm_object_t, vm_object_t, vm_pindex_t, int));
void swap_pager_freespace __P((vm_object_t, vm_pindex_t, vm_size_t));
void swap_pager_dmzspace __P((vm_object_t, vm_pindex_t, vm_size_t));
void swap_pager_swap_init __P((void));
-void swap_pager_sync __P((void));
+
+/*
+ * newswap functions
+ */
+
+void swap_pager_page_removed __P((vm_page_t, vm_object_t));
+
#endif
#endif /* _SWAP_PAGER_ */
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index e3d64f9..d0f4754 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -66,7 +66,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_fault.c,v 1.92 1999/01/08 17:31:24 eivind Exp $
+ * $Id: vm_fault.c,v 1.93 1999/01/10 01:58:28 eivind Exp $
*/
/*
@@ -114,7 +114,7 @@ struct faultstate {
struct vnode *vp;
};
-static void
+static __inline void
release_page(struct faultstate *fs)
{
vm_page_wakeup(fs->m);
@@ -122,7 +122,7 @@ release_page(struct faultstate *fs)
fs->m = NULL;
}
-static void
+static __inline void
unlock_map(struct faultstate *fs)
{
if (fs->lookup_still_valid) {
@@ -263,36 +263,43 @@ RetryFault:;
fs.object = fs.first_object;
fs.pindex = fs.first_pindex;
- /*
- * See whether this page is resident
- */
while (TRUE) {
+ /*
+ * If the object is dead, we stop here
+ */
if (fs.object->flags & OBJ_DEAD) {
unlock_and_deallocate(&fs);
return (KERN_PROTECTION_FAILURE);
}
+
+ /*
+ * See if page is resident
+ */
fs.m = vm_page_lookup(fs.object, fs.pindex);
if (fs.m != NULL) {
int queue, s;
/*
- * If the page is being brought in, wait for it and
- * then retry.
+ * Wait/Retry if the page is busy. We have to do this
+ * if the page is busy via either PG_BUSY or
+ * vm_page_t->busy because the vm_pager may be using
+ * vm_page_t->busy for pageouts ( and even pageins if
+ * it is the vnode pager ), and we could end up trying
+ * to pagein and pageout the same page simultaneously.
+ *
+ * We can theoretically allow the busy case on a read
+ * fault if the page is marked valid, but since such
+ * pages are typically already pmap'd, putting that
+ * special case in might be more effort than it is
+ * worth. We cannot under any circumstances mess
+ * around with a vm_page_t->busy page except, perhaps,
+ * to pmap it.
*/
- if ((fs.m->flags & PG_BUSY) ||
- (fs.m->busy &&
- (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
+ if ((fs.m->flags & PG_BUSY) || fs.m->busy) {
unlock_things(&fs);
- s = splvm();
- if ((fs.m->flags & PG_BUSY) ||
- (fs.m->busy &&
- (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
- vm_page_flag_set(fs.m, PG_WANTED | PG_REFERENCED);
- cnt.v_intrans++;
- tsleep(fs.m, PSWP, "vmpfw", 0);
- }
- splx(s);
+ (void)vm_page_sleep_busy(fs.m, TRUE, "vmpfw");
+ cnt.v_intrans++;
vm_object_deallocate(fs.first_object);
goto RetryFault;
}
@@ -302,8 +309,12 @@ RetryFault:;
vm_page_unqueue_nowakeup(fs.m);
splx(s);
+#if 0
/*
- * Mark page busy for other processes, and the pagedaemon.
+ * Code removed. In a low-memory situation (say, a
+ * memory-bound program is running), the last thing you
+ * want to do is starve reactivations for other processes.
+ * XXX we need to find a better way.
*/
if (((queue - fs.m->pc) == PQ_CACHE) &&
(cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
@@ -312,6 +323,13 @@ RetryFault:;
VM_WAIT;
goto RetryFault;
}
+#endif
+ /*
+ * Mark page busy for other processes, and the
+ * pagedaemon. If it still isn't completely valid
+ * (readable), jump to readrest, else break-out ( we
+ * found the page ).
+ */
vm_page_busy(fs.m);
if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) &&
@@ -321,6 +339,12 @@ RetryFault:;
break;
}
+
+ /*
+ * Page is not resident.  If this is the search termination,
+ * allocate a new page.
+ */
+
if (((fs.object->type != OBJT_DEFAULT) &&
(((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired))
|| (fs.object == fs.first_object)) {
@@ -344,6 +368,13 @@ RetryFault:;
}
readrest:
+ /*
+ * Have page, but it may not be entirely valid ( or valid at
+ * all ). If this object is not the default, try to fault-in
+ * the page as well as activate additional pages when
+ * appropriate, and page-in additional pages when appropriate.
+ */
+
if (fs.object->type != OBJT_DEFAULT &&
(((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) {
int rv;
@@ -410,13 +441,16 @@ readrest:
* vm_page_t passed to the routine. The reqpage
* return value is the index into the marray for the
* vm_page_t passed to the routine.
+ *
+ * fs.m plus the additional pages are PG_BUSY'd.
*/
faultcount = vm_fault_additional_pages(
fs.m, behind, ahead, marray, &reqpage);
/*
* Call the pager to retrieve the data, if any, after
- * releasing the lock on the map.
+ * releasing the lock on the map. We hold a ref on
+ * fs.object and the pages are PG_BUSY'd.
*/
unlock_map(&fs);
@@ -442,7 +476,7 @@ readrest:
}
hardfault++;
- break;
+ break; /* break to PAGE HAS BEEN FOUND */
}
/*
* Remove the bogus page (which does not exist at this
@@ -486,8 +520,8 @@ readrest:
}
}
/*
- * We get here if the object has default pager (or unwiring) or the
- * pager doesn't have the page.
+ * We get here if the object has default pager (or unwiring)
+ * or the pager doesn't have the page.
*/
if (fs.object == fs.first_object)
fs.first_m = fs.m;
@@ -518,15 +552,17 @@ readrest:
cnt.v_ozfod++;
}
cnt.v_zfod++;
- break;
+ break; /* break to PAGE HAS BEEN FOUND */
} else {
if (fs.object != fs.first_object) {
vm_object_pip_wakeup(fs.object);
}
+ KASSERT(fs.object != next_object, ("object loop %p", next_object));
fs.object = next_object;
vm_object_pip_add(fs.object, 1);
}
}
+
KASSERT((fs.m->flags & PG_BUSY) != 0,
("vm_fault: not busy after main loop"));
@@ -549,14 +585,15 @@ readrest:
*/
if (fault_type & VM_PROT_WRITE) {
-
/*
- * This allows pages to be virtually copied from a backing_object
- * into the first_object, where the backing object has no other
- * refs to it, and cannot gain any more refs. Instead of a
- * bcopy, we just move the page from the backing object to the
- * first object. Note that we must mark the page dirty in the
- * first object so that it will go out to swap when needed.
+ * This allows pages to be virtually copied from a
+ * backing_object into the first_object, where the
+ * backing object has no other refs to it, and cannot
+ * gain any more refs. Instead of a bcopy, we just
+ * move the page from the backing object to the
+ * first object. Note that we must mark the page
+ * dirty in the first object so that it will go out
+ * to swap when needed.
*/
if (map_generation == fs.map->timestamp &&
/*
@@ -598,11 +635,12 @@ readrest:
fs.first_m = NULL;
/*
- * grab the page and put it into the process'es object
+ * grab the page and put it into the
+ * process's object.  The page is
+ * automatically made dirty.
*/
vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
fs.first_m = fs.m;
- fs.first_m->dirty = VM_PAGE_BITS_ALL;
vm_page_busy(fs.first_m);
fs.m = NULL;
cnt.v_cow_optim++;
@@ -620,7 +658,13 @@ readrest:
release_page(&fs);
}
+ /*
+ * fs.object != fs.first_object due to above
+ * conditional
+ */
+
vm_object_pip_wakeup(fs.object);
+
/*
* Only use the new page below...
*/
@@ -708,9 +752,13 @@ readrest:
* If the fault is a write, we know that this page is being
* written NOW. This will save on the pmap_is_modified() calls
* later.
+ *
+ * Also tell the backing pager, if any, that it should remove
+ * any swap backing since the page is now dirty.
*/
if (fault_flags & VM_FAULT_DIRTY) {
fs.m->dirty = VM_PAGE_BITS_ALL;
+ vm_pager_page_unswapped(fs.m);
}
}
@@ -1021,8 +1069,7 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
* if the requested page is not available, then give up now
*/
- if (!vm_pager_has_page(object,
- OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) {
+ if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
return 0;
}
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index ec844db..0a3309d 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -59,7 +59,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_glue.c,v 1.79 1998/12/19 08:23:31 julian Exp $
+ * $Id: vm_glue.c,v 1.80 1999/01/07 21:23:50 julian Exp $
*/
#include "opt_rlimit.h"
@@ -213,10 +213,19 @@ vm_fork(p1, p2, flags)
p1->p_vmspace->vm_refcnt++;
}
+ /*
+ * Great, so we have a memory-heavy process and the
+ * entire machine comes to a screeching halt because
+ * nobody can fork/exec anything. What we really need
+ * to do is fix the process swapper so it swaps out the right
+ * processes.
+ */
+#if 0
while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN);
VM_WAIT;
}
+#endif
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index ea7f45b..b2e1102 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_kern.c,v 1.49 1998/08/24 08:39:37 dfr Exp $
+ * $Id: vm_kern.c,v 1.50 1998/09/04 08:06:57 dfr Exp $
*/
/*
@@ -181,8 +181,9 @@ kmem_alloc(map, size)
VM_ALLOC_ZERO | VM_ALLOC_RETRY);
if ((mem->flags & PG_ZERO) == 0)
vm_page_zero_fill(mem);
- vm_page_flag_clear(mem, (PG_BUSY | PG_ZERO));
mem->valid = VM_PAGE_BITS_ALL;
+ vm_page_flag_clear(mem, PG_ZERO);
+ vm_page_wakeup(mem);
}
/*
@@ -200,6 +201,8 @@ kmem_alloc(map, size)
* Release a region of kernel virtual memory allocated
* with kmem_alloc, and return the physical pages
* associated with that region.
+ *
+ * This routine may not block on kernel maps.
*/
void
kmem_free(map, addr, size)
@@ -252,26 +255,31 @@ kmem_suballoc(parent, min, max, size)
}
/*
- * Allocate wired-down memory in the kernel's address map for the higher
- * level kernel memory allocator (kern/kern_malloc.c). We cannot use
- * kmem_alloc() because we may need to allocate memory at interrupt
- * level where we cannot block (canwait == FALSE).
+ * kmem_malloc:
+ *
+ * Allocate wired-down memory in the kernel's address map for the higher
+ * level kernel memory allocator (kern/kern_malloc.c). We cannot use
+ * kmem_alloc() because we may need to allocate memory at interrupt
+ * level where we cannot block (canwait == FALSE).
+ *
+ * This routine has its own private kernel submap (kmem_map) and object
+ * (kmem_object). This, combined with the fact that only malloc uses
+ * this routine, ensures that we will never block in map or object waits.
*
- * This routine has its own private kernel submap (kmem_map) and object
- * (kmem_object). This, combined with the fact that only malloc uses
- * this routine, ensures that we will never block in map or object waits.
+ * Note that this still only works in a uni-processor environment and
+ * when called at splhigh().
*
- * Note that this still only works in a uni-processor environment and
- * when called at splhigh().
+ * We don't worry about expanding the map (adding entries) since entries
+ * for wired maps are statically allocated.
*
- * We don't worry about expanding the map (adding entries) since entries
- * for wired maps are statically allocated.
+ * NOTE: This routine is not supposed to block if M_NOWAIT is set, but
+ * I have not verified that it actually does not block.
*/
vm_offset_t
-kmem_malloc(map, size, waitflag)
+kmem_malloc(map, size, flags)
register vm_map_t map;
register vm_size_t size;
- boolean_t waitflag;
+ int flags;
{
register vm_offset_t offset, i;
vm_map_entry_t entry;
@@ -297,7 +305,7 @@ kmem_malloc(map, size, waitflag)
printf("Out of mbuf clusters - adjust NMBCLUSTERS or increase maxusers!\n");
return (0);
}
- if (waitflag == M_WAITOK)
+ if ((flags & M_NOWAIT) == 0)
panic("kmem_malloc(%d): kmem_map too small: %d total allocated",
size, map->size);
return (0);
@@ -308,9 +316,19 @@ kmem_malloc(map, size, waitflag)
VM_PROT_ALL, VM_PROT_ALL, 0);
for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Note: if M_NOWAIT specified alone, allocate from
+ * interrupt-safe queues only (just the free list). If
+ * M_ASLEEP or M_USE_RESERVE is also specified, we can also
+ * allocate from the cache. Neither of the latter two
+ * flags may be specified from an interrupt since interrupts
+ * are not allowed to mess with the cache queue.
+ */
retry:
m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i),
- (waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM);
+ ((flags & (M_NOWAIT|M_ASLEEP|M_USE_RESERVE)) == M_NOWAIT) ?
+ VM_ALLOC_INTERRUPT :
+ VM_ALLOC_SYSTEM);
/*
* Ran out of space, free everything up and return. Don't need
@@ -318,7 +336,7 @@ retry:
* aren't on any queues.
*/
if (m == NULL) {
- if (waitflag == M_WAITOK) {
+ if ((flags & M_NOWAIT) == 0) {
VM_WAIT;
goto retry;
}
@@ -330,6 +348,9 @@ retry:
}
vm_map_delete(map, addr, addr + size);
vm_map_unlock(map);
+ if (flags & M_ASLEEP) {
+ VM_AWAIT;
+ }
return (0);
}
vm_page_flag_clear(m, PG_ZERO);
@@ -359,6 +380,9 @@ retry:
m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i));
vm_page_wire(m);
vm_page_wakeup(m);
+ /*
+ * Because this is kernel_pmap, this call will not block.
+ */
pmap_enter(kernel_pmap, addr + i, VM_PAGE_TO_PHYS(m),
VM_PROT_ALL, 1);
vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_REFERENCED);
@@ -369,12 +393,14 @@ retry:
}
/*
- * kmem_alloc_wait
+ * kmem_alloc_wait:
*
* Allocates pageable memory from a sub-map of the kernel. If the submap
* has no room, the caller sleeps waiting for more memory in the submap.
*
+ * This routine may block.
*/
+
vm_offset_t
kmem_alloc_wait(map, size)
vm_map_t map;
@@ -406,7 +432,7 @@ kmem_alloc_wait(map, size)
}
/*
- * kmem_free_wakeup
+ * kmem_free_wakeup:
*
* Returns memory to a submap of the kernel, and wakes up any processes
* waiting for memory in that map.
@@ -424,11 +450,14 @@ kmem_free_wakeup(map, addr, size)
}
/*
- * Create the kernel map; insert a mapping covering kernel text, data, bss,
- * and all space allocated thus far (`boostrap' data). The new map will thus
- * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and
- * the range between `start' and `end' as free.
+ * kmem_init:
+ *
+ * Create the kernel map; insert a mapping covering kernel text,
+ * data, bss, and all space allocated thus far (`bootstrap' data).  The
+ * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and
+ * `start' as allocated, and the range between `start' and `end' as free.
*/
+
void
kmem_init(start, end)
vm_offset_t start, end;
@@ -445,3 +474,4 @@ kmem_init(start, end)
/* ... and ending with the completion of the above `insert' */
vm_map_unlock(m);
}
+
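
The vm_page_alloc() call in kmem_malloc() above chooses its allocation class from the malloc flags: M_NOWAIT by itself means interrupt context, so only the interrupt-safe free list may be used; combining it with M_ASLEEP or M_USE_RESERVE (or omitting it) selects the system class. A compilable sketch of just that predicate; the flag and class values below are stand-ins, not the kernel's:

#include <stdio.h>

#define M_NOWAIT	0x0001
#define M_ASLEEP	0x0002
#define M_USE_RESERVE	0x0004

#define VM_ALLOC_INTERRUPT	1
#define VM_ALLOC_SYSTEM		2

static int
alloc_class(int flags)
{
	/* M_NOWAIT alone (no sleep, no reserve) => interrupt-safe queues only */
	return (((flags & (M_NOWAIT | M_ASLEEP | M_USE_RESERVE)) == M_NOWAIT) ?
	    VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM);
}

int
main(void)
{
	printf("%d\n", alloc_class(M_NOWAIT));			/* 1: interrupt */
	printf("%d\n", alloc_class(M_NOWAIT | M_USE_RESERVE));	/* 2: system    */
	printf("%d\n", alloc_class(0));				/* 2: system    */
	return (0);
}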
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 829548a..f495788 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_map.c,v 1.138 1998/10/25 17:44:58 phk Exp $
+ * $Id: vm_map.c,v 1.139 1999/01/06 23:05:41 julian Exp $
*/
/*
@@ -440,7 +440,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_map_entry_t new_entry;
vm_map_entry_t prev_entry;
vm_map_entry_t temp_entry;
+#if 0
vm_object_t prev_object;
+#endif
u_char protoeflags;
if ((object != NULL) && (cow & MAP_NOFAULT)) {
@@ -514,10 +516,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
map->size += (end - prev_entry->end);
prev_entry->end = end;
+#if 0
+ /*
+ * (no longer applies)
+ */
if ((cow & MAP_NOFAULT) == 0) {
prev_object = prev_entry->object.vm_object;
default_pager_convert_to_swapq(prev_object);
}
+#endif
return (KERN_SUCCESS);
}
else {
@@ -573,7 +580,12 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
(prev_entry->end >= new_entry->start))
map->first_free = new_entry;
+#if 0
+ /*
+ * (no longer applies)
+ */
default_pager_convert_to_swapq(object);
+#endif
return (KERN_SUCCESS);
}
@@ -1504,7 +1516,12 @@ vm_map_user_pageable(map, start, end, new_pageable)
entry->offset = (vm_offset_t) 0;
}
+#if 0
+ /*
+ * (no longer applies)
+ */
default_pager_convert_to_swapq(entry->object.vm_object);
+#endif
}
vm_map_clip_start(map, entry, start);
@@ -1695,7 +1712,12 @@ vm_map_pageable(map, start, end, new_pageable)
atop(entry->end - entry->start));
entry->offset = (vm_offset_t) 0;
}
+#if 0
+ /*
+ * (no longer applies)
+ */
default_pager_convert_to_swapq(entry->object.vm_object);
+#endif
}
}
vm_map_clip_start(map, entry, start);
@@ -2192,16 +2214,18 @@ vm_map_split(entry)
m = vm_page_lookup(orig_object, offidxstart + idx);
if (m == NULL)
continue;
- if (m->flags & PG_BUSY) {
- vm_page_flag_set(m, PG_WANTED);
- tsleep(m, PVM, "spltwt", 0);
+
+ /*
+ * We must wait for pending I/O to complete before we can
+ * rename the page.
+ */
+ if (vm_page_sleep_busy(m, TRUE, "spltwt"))
goto retry;
- }
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_rename(m, new_object, idx);
- m->dirty = VM_PAGE_BITS_ALL;
+ /* page automatically made dirty by rename */
vm_page_busy(m);
}
@@ -2212,9 +2236,7 @@ vm_map_split(entry)
* and destroy unneeded pages in
* shadow object.
*/
- swap_pager_copy(orig_object, OFF_TO_IDX(orig_object->paging_offset),
- new_object, OFF_TO_IDX(new_object->paging_offset),
- offidxstart, 0);
+ swap_pager_copy(orig_object, new_object, offidxstart, 0);
vm_object_pip_wakeup(orig_object);
}
@@ -2670,8 +2692,13 @@ RetryLookup:;
vm_map_lock_downgrade(share_map);
}
+#if 0
+ /*
+ * (no longer applies)
+ */
if (entry->object.vm_object->type == OBJT_DEFAULT)
default_pager_convert_to_swapq(entry->object.vm_object);
+#endif
/*
* Return the object/offset from this entry. If the entry was
* copy-on-write or empty, it has been fixed up.
@@ -2781,6 +2808,10 @@ vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages)
vm_map_lookup_done(map, entry);
return 0;
}
+ /*
+ * disallow busy or invalid pages, but allow
+ * m->busy pages if they are entirely valid.
+ */
if ((m->flags & PG_BUSY) ||
((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
vm_map_lookup_done(map, entry);
@@ -2856,7 +2887,7 @@ vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages)
*/
if (first_object->type == OBJT_SWAP) {
swap_pager_freespace(first_object,
- OFF_TO_IDX(first_object->paging_offset),
+ 0,
first_object->size);
}
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 5bc74bd..bb52f66 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)vm_meter.c 8.4 (Berkeley) 1/4/94
- * $Id: vm_meter.c,v 1.26 1998/08/24 08:39:37 dfr Exp $
+ * $Id: vm_meter.c,v 1.27 1998/10/31 17:21:31 peter Exp $
*/
#include <sys/param.h>
@@ -195,6 +195,11 @@ vmtotal SYSCTL_HANDLER_ARGS
for (object = TAILQ_FIRST(&vm_object_list);
object != NULL;
object = TAILQ_NEXT(object, object_list)) {
+ /*
+ * devices, like /dev/mem, will badly skew our totals
+ */
+ if (object->type == OBJT_DEVICE)
+ continue;
totalp->t_vm += object->size;
totalp->t_rm += object->resident_page_count;
if (object->flags & OBJ_ACTIVE) {
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index ba36e41..1374dfb 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -38,7 +38,7 @@
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
*
* @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
- * $Id: vm_mmap.c,v 1.85 1998/12/09 20:22:21 dt Exp $
+ * $Id: vm_mmap.c,v 1.86 1999/01/06 23:05:42 julian Exp $
*/
/*
@@ -71,6 +71,7 @@
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
+#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index a1477f2..86c71c8 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_object.c,v 1.137 1999/01/08 17:31:26 eivind Exp $
+ * $Id: vm_object.c,v 1.138 1999/01/10 01:58:28 eivind Exp $
*/
/*
@@ -134,9 +134,12 @@ static long object_bypasses;
static int next_index;
static vm_zone_t obj_zone;
static struct vm_zone obj_zone_store;
+static int object_hash_rand;
#define VM_OBJECTS_INIT 256
static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
+#if 0
static int objidnumber;
+#endif
void
_vm_object_allocate(type, size, object)
@@ -152,7 +155,9 @@ _vm_object_allocate(type, size, object)
object->size = size;
object->ref_count = 1;
object->flags = 0;
+#if 0
object->id = ++objidnumber;
+#endif
if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
vm_object_set_flag(object, OBJ_ONEMAPPING);
object->behavior = OBJ_NORMAL;
@@ -168,16 +173,25 @@ _vm_object_allocate(type, size, object)
incr = size;
next_index = (next_index + incr) & PQ_L2_MASK;
object->handle = NULL;
- object->paging_offset = (vm_ooffset_t) 0;
object->backing_object = NULL;
object->backing_object_offset = (vm_ooffset_t) 0;
+#if 0
object->page_hint = NULL;
+#endif
+ /*
+ * Try to generate a number that will spread objects out in the
+ * hash table. We 'wipe' new objects across the hash in 128 page
+ * increments plus 1 more to offset it a little more by the time
+ * it wraps around.
+ */
+ object->hash_rand = object_hash_rand - 129;
object->last_read = 0;
object->generation++;
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
vm_object_count++;
+ object_hash_rand = object->hash_rand;
}
/*
@@ -336,25 +350,15 @@ vm_object_deallocate(object)
robject->ref_count++;
- retry:
- if (robject->paging_in_progress ||
- object->paging_in_progress) {
+ while (
+ robject->paging_in_progress ||
+ object->paging_in_progress
+ ) {
vm_object_pip_sleep(robject, "objde1");
- if (robject->paging_in_progress &&
- robject->type == OBJT_SWAP) {
- swap_pager_sync();
- goto retry;
- }
-
vm_object_pip_sleep(object, "objde2");
- if (object->paging_in_progress &&
- object->type == OBJT_SWAP) {
- swap_pager_sync();
- }
- goto retry;
}
- if( robject->ref_count == 1) {
+ if (robject->ref_count == 1) {
robject->ref_count--;
object = robject;
goto doterm;
@@ -396,6 +400,7 @@ doterm:
* up all previously used resources.
*
* The object must be locked.
+ * This routine may block.
*/
void
vm_object_terminate(object)
@@ -444,13 +449,13 @@ vm_object_terminate(object)
/*
* Now free any remaining pages. For internal objects, this also
* removes them from paging queues. Don't free wired pages, just
- * remove them from the object.
+ * remove them from the object.
*/
s = splvm();
while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
#if !defined(MAX_PERF)
if (p->busy || (p->flags & PG_BUSY))
- printf("vm_object_terminate: freeing busy page\n");
+ panic("vm_object_terminate: freeing busy page %p\n", p);
#endif
if (p->wire_count == 0) {
vm_page_busy(p);
@@ -566,9 +571,7 @@ rescan:
}
s = splvm();
- while ((p->flags & PG_BUSY) || p->busy) {
- vm_page_flag_set(p, PG_WANTED | PG_REFERENCED);
- tsleep(p, PVM, "vpcwai", 0);
+ while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
if (object->generation != curgeneration) {
splx(s);
goto rescan;
@@ -763,6 +766,12 @@ vm_object_pmap_remove(object, start, end)
* vm_object_madvise:
*
* Implements the madvise function at the object/page level.
+ *
+ * Currently, madvise() functions are limited to the default and
+ * swap object types only, and also limited to only the unshared portions
+ * of a process's address space. MADV_FREE, certainly, could never be
+ * run on anything else. The others are more flexible and the code could
+ * be adjusted in the future to handle expanded cases for them.
*/
void
vm_object_madvise(object, pindex, count, advise)
@@ -780,22 +789,59 @@ vm_object_madvise(object, pindex, count, advise)
end = pindex + count;
- for (; pindex < end; pindex += 1) {
+ /*
+ * MADV_FREE special case - free any swap backing store (as well
+ * as resident pages later on).
+ */
+
+ if (advise == MADV_FREE) {
+ tobject = object;
+ tpindex = pindex;
+ while (
+ (tobject->type == OBJT_DEFAULT ||
+ tobject->type == OBJT_SWAP) &&
+ (tobject->flags & OBJ_ONEMAPPING)
+ ) {
+ if (tobject->type == OBJT_SWAP) {
+ swap_pager_freespace(tobject, tpindex, count);
+ }
+ if ((tobject = tobject->backing_object) == NULL)
+ break;
+ tpindex += OFF_TO_IDX(tobject->backing_object_offset);
+ }
+ }
+
+ /*
+ * Locate and adjust resident pages
+ */
+
+ for (; pindex < end; pindex += 1) {
relookup:
tobject = object;
tpindex = pindex;
shadowlookup:
+
+ if (tobject->type != OBJT_DEFAULT &&
+ tobject->type != OBJT_SWAP
+ ) {
+ continue;
+ }
+
+ if ((tobject->flags & OBJ_ONEMAPPING) == 0)
+ continue;
+
m = vm_page_lookup(tobject, tpindex);
+
if (m == NULL) {
- if (tobject->type != OBJT_DEFAULT) {
- continue;
- }
-
tobject = tobject->backing_object;
+ if (tobject == NULL)
+ continue;
+#if 0
if ((tobject == NULL) || (tobject->ref_count != 1)) {
continue;
}
+#endif
tpindex += OFF_TO_IDX(tobject->backing_object_offset);
goto shadowlookup;
}
@@ -805,12 +851,15 @@ shadowlookup:
* we skip it. Things can break if we mess with pages
* in any of the below states.
*/
- if (m->hold_count || m->wire_count ||
- m->valid != VM_PAGE_BITS_ALL) {
+ if (
+ m->hold_count ||
+ m->wire_count ||
+ m->valid != VM_PAGE_BITS_ALL
+ ) {
continue;
}
- if (vm_page_sleep(m, "madvpo", &m->busy))
+ if (vm_page_sleep_busy(m, TRUE, "madvpo"))
goto relookup;
if (advise == MADV_WILLNEED) {
@@ -818,15 +867,25 @@ shadowlookup:
} else if (advise == MADV_DONTNEED) {
vm_page_deactivate(m);
} else if (advise == MADV_FREE) {
- pmap_clear_modify(VM_PAGE_TO_PHYS(m));
- m->dirty = 0;
/*
- * Force a demand zero if attempt to read from swap.
- * We currently don't handle vnode files correctly,
- * and will reread stale contents unnecessarily.
+ * If MADV_FREE_FORCE_FREE is defined, we attempt to
+ * immediately free the page. Otherwise we just
+ * destroy any swap backing store, mark it clean,
+ * and stuff it into the cache.
*/
- if (object->type == OBJT_SWAP)
- swap_pager_dmzspace(tobject, m->pindex, 1);
+ pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+ m->dirty = 0;
+
+#ifdef MADV_FREE_FORCE_FREE
+ if (tobject->resident_page_count > 1) {
+ vm_page_busy(m);
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ } else
+#endif
+ {
+ vm_page_cache(m);
+ }
}
}
}
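(For context, this code path is reached from user space through madvise(2). A small illustrative program, assuming an anonymous private mapping and that MADV_FREE is available as it is on FreeBSD:

#include <sys/mman.h>
#include <err.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 16 * getpagesize();
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0xA5, len);		/* dirty the pages */

	/* Tell the VM system the contents are disposable; with the
	 * change above this also frees any swap backing the range. */
	if (madvise(p, len, MADV_FREE) == -1)
		err(1, "madvise");
	return (0);
}
)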
@@ -900,8 +959,7 @@ vm_object_qcollapse(object)
register vm_object_t object;
{
register vm_object_t backing_object;
- register vm_pindex_t backing_offset_index, paging_offset_index;
- vm_pindex_t backing_object_paging_offset_index;
+ register vm_pindex_t backing_offset_index;
vm_pindex_t new_pindex;
register vm_page_t p, pp;
register vm_size_t size;
@@ -913,27 +971,39 @@ vm_object_qcollapse(object)
backing_object->ref_count += 2;
backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
- backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset);
- paging_offset_index = OFF_TO_IDX(object->paging_offset);
size = object->size;
+
p = TAILQ_FIRST(&backing_object->memq);
while (p) {
vm_page_t next;
+ /*
+ * Set up for the loop. Skip the page if it isn't trivial
+ * (busy, fictitious, invalid, held, or wired).
+ */
+
next = TAILQ_NEXT(p, listq);
if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) ||
!p->valid || p->hold_count || p->wire_count || p->busy) {
p = next;
continue;
}
+
+ /*
+ * busy the page and move it from the backing store to the
+ * parent object.
+ */
+
vm_page_busy(p);
+ KASSERT(p->object == object, ("vm_object_qcollapse(): object mismatch"));
+
new_pindex = p->pindex - backing_offset_index;
if (p->pindex < backing_offset_index ||
new_pindex >= size) {
if (backing_object->type == OBJT_SWAP)
swap_pager_freespace(backing_object,
- backing_object_paging_offset_index+p->pindex,
+ p->pindex,
1);
vm_page_protect(p, VM_PROT_NONE);
vm_page_free(p);
@@ -941,16 +1011,16 @@ vm_object_qcollapse(object)
pp = vm_page_lookup(object, new_pindex);
if (pp != NULL ||
(object->type == OBJT_SWAP && vm_pager_has_page(object,
- paging_offset_index + new_pindex, NULL, NULL))) {
+ new_pindex, NULL, NULL))) {
if (backing_object->type == OBJT_SWAP)
swap_pager_freespace(backing_object,
- backing_object_paging_offset_index + p->pindex, 1);
+ p->pindex, 1);
vm_page_protect(p, VM_PROT_NONE);
vm_page_free(p);
} else {
if (backing_object->type == OBJT_SWAP)
swap_pager_freespace(backing_object,
- backing_object_paging_offset_index + p->pindex, 1);
+ p->pindex, 1);
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
@@ -958,7 +1028,7 @@ vm_object_qcollapse(object)
vm_page_protect(p, VM_PROT_NONE);
vm_page_rename(p, object, new_pindex);
- p->dirty = VM_PAGE_BITS_ALL;
+ /* page automatically made dirty by rename */
}
}
p = next;
@@ -1049,9 +1119,10 @@ vm_object_collapse(object)
*/
while ((p = TAILQ_FIRST(&backing_object->memq)) != 0) {
-
- new_pindex = p->pindex - backing_offset_index;
+ if (vm_page_sleep_busy(p, TRUE, "vmocol"))
+ continue;
vm_page_busy(p);
+ new_pindex = p->pindex - backing_offset_index;
/*
* If the parent has a page here, or if this
@@ -1068,7 +1139,7 @@ vm_object_collapse(object)
} else {
pp = vm_page_lookup(object, new_pindex);
if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object,
- OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) {
+ new_pindex, NULL, NULL))) {
vm_page_protect(p, VM_PROT_NONE);
vm_page_free(p);
} else {
@@ -1077,7 +1148,7 @@ vm_object_collapse(object)
else
vm_page_protect(p, VM_PROT_NONE);
vm_page_rename(p, object, new_pindex);
- p->dirty = VM_PAGE_BITS_ALL;
+ /* page automatically made dirty by rename */
}
}
}
@@ -1088,52 +1159,22 @@ vm_object_collapse(object)
if (backing_object->type == OBJT_SWAP) {
vm_object_pip_add(backing_object, 1);
- if (object->type == OBJT_SWAP) {
- vm_object_pip_add(object, 1);
- /*
- * copy shadow object pages into ours
- * and destroy unneeded pages in
- * shadow object.
- */
- swap_pager_copy(
- backing_object,
- OFF_TO_IDX(backing_object->paging_offset),
- object,
- OFF_TO_IDX(object->paging_offset),
- OFF_TO_IDX(object->backing_object_offset), TRUE);
- vm_object_pip_wakeup(object);
- } else {
- vm_object_pip_add(object, 1);
- /*
- * move the shadow backing_object's pager data to
- * "object" and convert "object" type to OBJT_SWAP.
- */
- object->type = OBJT_SWAP;
- object->un_pager.swp.swp_nblocks =
- backing_object->un_pager.swp.swp_nblocks;
- object->un_pager.swp.swp_allocsize =
- backing_object->un_pager.swp.swp_allocsize;
- object->un_pager.swp.swp_blocks =
- backing_object->un_pager.swp.swp_blocks;
- object->un_pager.swp.swp_poip = /* XXX */
- backing_object->un_pager.swp.swp_poip;
- object->paging_offset = backing_object->paging_offset + backing_offset;
- TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
-
- /*
- * Convert backing object from OBJT_SWAP to
- * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is
- * actually necessary.
- */
- backing_object->type = OBJT_DEFAULT;
- TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list);
- /*
- * free unnecessary blocks
- */
- swap_pager_freespace(object, 0,
- OFF_TO_IDX(object->paging_offset));
- vm_object_pip_wakeup(object);
- }
+
+ /*
+ * scrap the paging_offset junk and do a
+ * discrete copy. This also removes major
+ * assumptions about how the swap pager
+ * works from code where they don't belong. The
+ * new swapper is able to optimize the
+ * destroy-source case.
+ */
+
+ vm_object_pip_add(object, 1);
+ swap_pager_copy(
+ backing_object,
+ object,
+ OFF_TO_IDX(object->backing_object_offset), TRUE);
+ vm_object_pip_wakeup(object);
vm_object_pip_wakeup(backing_object);
}
@@ -1223,7 +1264,7 @@ vm_object_collapse(object)
vm_page_busy(pp);
if ((pp->valid == 0) &&
- !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) {
+ !vm_pager_has_page(object, new_pindex, NULL, NULL)) {
/*
* Page still needed. Can't go any
* further.
@@ -1318,7 +1359,7 @@ again:
* interrupt -- minimize the spl transitions
*/
- if (vm_page_sleep(p, "vmopar", &p->busy))
+ if (vm_page_sleep_busy(p, TRUE, "vmopar"))
goto again;
if (clean_only && p->valid) {
@@ -1349,7 +1390,7 @@ again:
* The busy flags are only cleared at
* interrupt -- minimize the spl transitions
*/
- if (vm_page_sleep(p, "vmopar", &p->busy))
+ if (vm_page_sleep_busy(p, TRUE, "vmopar"))
goto again;
if (clean_only && p->valid) {
@@ -1589,11 +1630,10 @@ DB_SHOW_COMMAND(object, vm_object_print_static)
object, (int)object->type, (u_long)object->size,
object->resident_page_count, object->ref_count, object->flags);
/*
- * XXX no %qd in kernel. Truncate object->paging_offset and
- * object->backing_object_offset.
+ * XXX no %qd in kernel. Truncate object->backing_object_offset.
*/
- db_iprintf(" sref=%d, offset=0x%lx, backing_object(%d)=(%p)+0x%lx\n",
- object->shadow_count, (long)object->paging_offset,
+ db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
+ object->shadow_count,
object->backing_object ? object->backing_object->ref_count : 0,
object->backing_object, (long)object->backing_object_offset);
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 9897393..7f54ab6 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_object.h,v 1.50 1998/08/06 08:33:19 dfr Exp $
+ * $Id: vm_object.h,v 1.51 1998/08/24 08:39:37 dfr Exp $
*/
/*
@@ -81,6 +81,7 @@ typedef enum obj_type objtype_t;
* Types defined:
*
* vm_object_t Virtual memory object.
+ *
*/
struct vm_object {
@@ -94,32 +95,49 @@ struct vm_object {
int ref_count; /* How many refs?? */
int shadow_count; /* how many objects that this is a shadow for */
int pg_color; /* color of first page in obj */
- int id; /* ID for no purpose, other than info */
+#if 0
+ int id; /* ID for no purpose, other than info */
+#endif
+ int hash_rand; /* vm hash table randomizer */
u_short flags; /* see below */
u_short paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
u_short behavior; /* see below */
int resident_page_count; /* number of resident pages */
- int cache_count; /* number of cached pages */
- int wire_count; /* number of wired pages */
- vm_ooffset_t paging_offset; /* Offset into paging space */
+ int cache_count; /* number of cached pages */
+ int wire_count; /* number of wired pages */
struct vm_object *backing_object; /* object that I'm a shadow of */
vm_ooffset_t backing_object_offset;/* Offset in backing object */
vm_offset_t last_read; /* last read in object -- detect seq behavior */
- vm_page_t page_hint; /* hint for last looked-up or allocated page */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
void *handle;
union {
+ /*
+ * VNode pager
+ *
+ * vnp_size - current size of file
+ */
struct {
- off_t vnp_size; /* Current size of file */
+ off_t vnp_size;
} vnp;
+
+ /*
+ * Device pager
+ *
+ * devp_pglist - list of allocated pages
+ */
struct {
- TAILQ_HEAD(, vm_page) devp_pglist; /* list of pages allocated */
+ TAILQ_HEAD(, vm_page) devp_pglist;
} devp;
+
+ /*
+ * Swap pager
+ *
+ * swp_bcount - number of swap 'swblock' metablocks, each
+ * contains up to 16 swapblk assignments.
+ * see vm/swap_pager.h
+ */
struct {
- int swp_nblocks;
- int swp_allocsize;
- struct swblock *swp_blocks;
- short swp_poip;
+ int swp_bcount;
} swp;
} un_pager;
};
@@ -132,7 +150,7 @@ struct vm_object {
#define OBJ_NOSPLIT 0x0010 /* dont split this object */
#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
#define OBJ_WRITEABLE 0x0080 /* object has been made writable */
-#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */
+#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */
#define OBJ_CLEANING 0x0200
#define OBJ_OPT 0x1000 /* I/O optimization */
#define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */
@@ -197,12 +215,21 @@ vm_object_pip_wakeup(vm_object_t object)
}
static __inline void
-vm_object_pip_sleep(vm_object_t object, char *waitid)
+vm_object_pip_wakeupn(vm_object_t object, int i)
{
- int s;
+ if (i)
+ atomic_subtract_short(&object->paging_in_progress, i);
+ if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
+ vm_object_clear_flag(object, OBJ_PIPWNT);
+ wakeup(object);
+ }
+}
+static __inline void
+vm_object_pip_sleep(vm_object_t object, char *waitid)
+{
if (object->paging_in_progress) {
- s = splvm();
+ int s = splvm();
if (object->paging_in_progress) {
vm_object_set_flag(object, OBJ_PIPWNT);
tsleep(object, PVM, waitid, 0);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index c953559..2f0f4bd 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -34,7 +34,7 @@
* SUCH DAMAGE.
*
* from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
- * $Id: vm_page.c,v 1.115 1999/01/08 17:31:27 eivind Exp $
+ * $Id: vm_page.c,v 1.116 1999/01/10 01:58:29 eivind Exp $
*/
/*
@@ -83,6 +83,7 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
static void vm_page_queue_init __P((void));
@@ -95,7 +96,7 @@ static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t));
* page structure.
*/
-static struct pglist *vm_page_buckets; /* Array of buckets */
+static struct vm_page **vm_page_buckets; /* Array of buckets */
static int vm_page_bucket_count; /* How big is array? */
static int vm_page_hash_mask; /* Mask for hash function */
static volatile int vm_page_bucket_generation;
@@ -162,7 +163,6 @@ static u_short vm_page_dev_bsize_chunks[] = {
};
static __inline int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex));
-static int vm_page_freechk_and_unqueue __P((vm_page_t m));
static void vm_page_free_wakeup __P((void));
/*
@@ -206,7 +206,7 @@ vm_page_startup(starta, enda, vaddr)
{
register vm_offset_t mapped;
register vm_page_t m;
- register struct pglist *bucket;
+ register struct vm_page **bucket;
vm_size_t npages, page_range;
register vm_offset_t new_start;
int i;
@@ -256,24 +256,30 @@ vm_page_startup(starta, enda, vaddr)
*
* The number of buckets MUST BE a power of 2, and the actual value is
* the next power of 2 greater than the number of physical pages in
- * the system.
+ * the system.
+ *
+ * We make the hash table approximately 2x the number of pages to
+ * reduce the chain length. With the singly-linked list this is
+ * about the same total size as the 1x TAILQ-based table we were
+ * using before, but the chain length will be smaller.
*
* Note: This computation can be tweaked if desired.
*/
- vm_page_buckets = (struct pglist *) vaddr;
+ vm_page_buckets = (struct vm_page **)vaddr;
bucket = vm_page_buckets;
if (vm_page_bucket_count == 0) {
vm_page_bucket_count = 1;
while (vm_page_bucket_count < atop(total))
vm_page_bucket_count <<= 1;
}
+ vm_page_bucket_count <<= 1;
vm_page_hash_mask = vm_page_bucket_count - 1;
/*
* Validate these addresses.
*/
- new_start = start + vm_page_bucket_count * sizeof(struct pglist);
+ new_start = start + vm_page_bucket_count * sizeof(struct vm_page *);
new_start = round_page(new_start);
mapped = round_page(vaddr);
vaddr = pmap_map(mapped, start, new_start,
@@ -283,7 +289,7 @@ vm_page_startup(starta, enda, vaddr)
bzero((caddr_t) mapped, vaddr - mapped);
for (i = 0; i < vm_page_bucket_count; i++) {
- TAILQ_INIT(bucket);
+ *bucket = NULL;
bucket++;
}
@@ -353,13 +359,18 @@ vm_page_startup(starta, enda, vaddr)
*
* NOTE: This macro depends on vm_page_bucket_count being a power of 2.
* This routine may not block.
+ *
+ * We try to randomize the hash based on the object to spread the pages
+ * out in the hash table without it costing us too much.
*/
static __inline int
vm_page_hash(object, pindex)
vm_object_t object;
vm_pindex_t pindex;
{
- return ((((uintptr_t) object) >> 5) + (pindex >> 1)) & vm_page_hash_mask;
+ int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
+
+ return(i & vm_page_hash_mask);
}
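(A stand-alone sketch of the same hashing scheme, with an invented mask and object addresses; the kernel sizes the mask at boot and seeds hash_rand per object as shown earlier in this commit:

#include <stdint.h>
#include <stdio.h>

#define HASH_MASK	0x3ff		/* illustrative: 1024 buckets - 1 */

static unsigned
page_hash(uintptr_t object, unsigned long pindex, unsigned hash_rand)
{
	/* same form as vm_page_hash() above */
	return (((unsigned)(object + pindex) ^ hash_rand) & HASH_MASK);
}

int
main(void)
{
	unsigned hash_rand = 0;
	uintptr_t obj = 0xc0de0000;
	int n;

	/* each new object backs hash_rand up by 129 (128 pages plus one) */
	for (n = 0; n < 4; n++) {
		hash_rand -= 129;
		printf("object %d, pindex 0 -> bucket %u\n",
		    n, page_hash(obj + n * 0x100, 0, hash_rand));
	}
	return (0);
}
)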
/*
@@ -382,7 +393,7 @@ vm_page_insert(m, object, pindex)
register vm_object_t object;
register vm_pindex_t pindex;
{
- register struct pglist *bucket;
+ register struct vm_page **bucket;
if (m->object != NULL)
panic("vm_page_insert: already inserted");
@@ -399,7 +410,8 @@ vm_page_insert(m, object, pindex)
*/
bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
- TAILQ_INSERT_TAIL(bucket, m, hashq);
+ m->hnext = *bucket;
+ *bucket = m;
vm_page_bucket_generation++;
/*
@@ -407,7 +419,9 @@ vm_page_insert(m, object, pindex)
*/
TAILQ_INSERT_TAIL(&object->memq, m, listq);
+#if 0
m->object->page_hint = m;
+#endif
m->object->generation++;
if (m->wire_count)
@@ -417,50 +431,48 @@ vm_page_insert(m, object, pindex)
object->cache_count++;
/*
- * And show that the object has one more resident page.
+ * show that the object has one more resident page.
*/
object->resident_page_count++;
}
/*
- * vm_page_remove: [ internal use only ]
+ * vm_page_remove:
* NOTE: used by device pager as well -wfj
*
* Removes the given mem entry from the object/offset-page
- * table and the object page list.
+ * table and the object page list, but do not invalidate/terminate
+ * the backing store.
*
* The object and page must be locked, and at splhigh.
+ * The underlying pmap entry (if any) is NOT removed here.
* This routine may not block.
- *
- * I do not think the underlying pmap entry (if any) is removed here.
*/
-void
+vm_object_t
vm_page_remove(m)
- register vm_page_t m;
+ vm_page_t m;
{
- register struct pglist *bucket;
+ register struct vm_page **bucket;
vm_object_t object;
if (m->object == NULL)
- return;
+ return(NULL);
#if !defined(MAX_PERF)
if ((m->flags & PG_BUSY) == 0) {
panic("vm_page_remove: page not busy");
}
#endif
-
- vm_page_flag_clear(m, PG_BUSY);
- if (m->flags & PG_WANTED) {
- vm_page_flag_clear(m, PG_WANTED);
- wakeup(m);
- }
+
+ /*
+ * Basically destroy the page.
+ */
+
+ vm_page_wakeup(m);
object = m->object;
- if (object->page_hint == m)
- object->page_hint = NULL;
if (m->wire_count)
object->wire_count--;
@@ -469,11 +481,23 @@ vm_page_remove(m)
object->cache_count--;
/*
- * Remove from the object_object/offset hash table
+ * Remove from the object/offset hash table. The page
+ * must be on the hash queue; we will panic if it isn't.
+ *
+ * Note: we must NULL-out m->hnext to prevent loops in detached
+ * buffers with vm_page_lookup().
*/
bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
- TAILQ_REMOVE(bucket, m, hashq);
+ while (*bucket != m) {
+#if !defined(MAX_PERF)
+ if (*bucket == NULL)
+ panic("vm_page_remove(): page not found in hash");
+#endif
+ bucket = &(*bucket)->hnext;
+ }
+ *bucket = m->hnext;
+ m->hnext = NULL;
vm_page_bucket_generation++;
/*
@@ -490,6 +514,8 @@ vm_page_remove(m)
object->generation++;
m->object = NULL;
+
+ return(object);
}
/*
@@ -498,8 +524,14 @@ vm_page_remove(m)
* Returns the page associated with the object/offset
* pair specified; if none is found, NULL is returned.
*
+ * NOTE: the code below does not lock. It will operate properly if
+ * an interrupt makes a change, but the generation algorithm will not
+ * operate properly in an SMP environment where both CPUs are able to run
+ * kernel code simultaneously.
+ *
* The object must be locked. No side effects.
* This routine may not block.
+ * This is a critical path routine
*/
vm_page_t
@@ -508,25 +540,29 @@ vm_page_lookup(object, pindex)
register vm_pindex_t pindex;
{
register vm_page_t m;
- register struct pglist *bucket;
+ register struct vm_page **bucket;
int generation;
/*
* Search the hash table for this object/offset pair
*/
+#if 0
if (object->page_hint && (object->page_hint->pindex == pindex) &&
(object->page_hint->object == object))
return object->page_hint;
+#endif
retry:
generation = vm_page_bucket_generation;
bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
- for (m = TAILQ_FIRST(bucket); m != NULL; m = TAILQ_NEXT(m,hashq)) {
+ for (m = *bucket; m != NULL; m = m->hnext) {
if ((m->object == object) && (m->pindex == pindex)) {
if (vm_page_bucket_generation != generation)
goto retry;
+#if 0
m->object->page_hint = m;
+#endif
return (m);
}
}
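(The three operations on the new singly-linked buckets look like this in isolation; a toy, self-contained version with a simplified page structure whose field names are not the kernel's:

#include <stddef.h>
#include <stdio.h>

struct page {
	struct page *hnext;	/* hash chain link */
	int	     key;
};

static struct page *bucket;	/* one bucket; the kernel has an array */

static void
insert(struct page *m)
{
	m->hnext = bucket;	/* push on front, as vm_page_insert() does */
	bucket = m;
}

static void
remove_page(struct page *m)
{
	struct page **pp = &bucket;

	/* walk by address of the link, as vm_page_remove() now does */
	while (*pp != m) {
		if (*pp == NULL)
			return;	/* the kernel panics here instead */
		pp = &(*pp)->hnext;
	}
	*pp = m->hnext;
	m->hnext = NULL;	/* avoid stale loops through detached pages */
}

static struct page *
lookup(int key)
{
	struct page *m;

	for (m = bucket; m != NULL; m = m->hnext)
		if (m->key == key)
			return (m);
	return (NULL);
}

int
main(void)
{
	struct page a = { NULL, 1 }, b = { NULL, 2 };

	insert(&a);
	insert(&b);
	remove_page(&a);
	printf("lookup(2) %s\n", lookup(2) == &b ? "found" : "missing");
	printf("lookup(1) %s\n", lookup(1) == NULL ? "gone" : "still there");
	return (0);
}
)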
@@ -545,6 +581,16 @@ retry:
* This routine may not block.
*
* Note: this routine will raise itself to splvm(), the caller need not.
+ *
+ * Note: swap associated with the page must be invalidated by the move. We
+ * have to do this for several reasons: (1) we aren't freeing the
+ * page, (2) we are dirtying the page, (3) the VM system is probably
+ * moving the page from object A to B, and will then later move
+ * the backing store from A to B and we can't have a conflict.
+ *
+ * Note: we *always* dirty the page. It is necessary both for the
+ * fact that we moved it, and because we may be invalidating
+ * swap.
*/
void
@@ -558,6 +604,7 @@ vm_page_rename(m, new_object, new_pindex)
s = splvm();
vm_page_remove(m);
vm_page_insert(m, new_object, new_pindex);
+ m->dirty = VM_PAGE_BITS_ALL;
splx(s);
}
@@ -625,6 +672,12 @@ vm_page_unqueue(m)
*
* Find a page on the specified queue with color optimization.
*
+ * The page coloring optimization attempts to locate a page
+ * that does not overload other nearby pages in the object in
+ * the cpu's L1 or L2 caches. We need this optimization because
+ * cpu caches tend to be physical caches, while object spaces tend
+ * to be virtual.
+ *
* This routine must be called at splvm().
* This routine may not block.
*/
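(A toy illustration of the coloring idea: with a physically indexed cache, pages whose physical addresses collide modulo the cache size evict one another, so the allocator prefers free pages whose color differs from their neighbors'. The cache geometry below is invented for the example:

#include <stdio.h>

#define PAGE_SIZE	4096
#define L2_SIZE		(256 * 1024)		/* invented 256K cache */
#define NCOLORS		(L2_SIZE / PAGE_SIZE)	/* 64 color classes */

/* color class a physical page falls into */
static int
color_of(unsigned long phys)
{
	return (int)((phys / PAGE_SIZE) % NCOLORS);
}

int
main(void)
{
	/* two pages 256K apart share a color and contend in the cache */
	printf("%d %d\n", color_of(0x100000), color_of(0x100000 + L2_SIZE));
	/* adjacent pages get different colors and coexist */
	printf("%d %d\n", color_of(0x100000), color_of(0x100000 + PAGE_SIZE));
	return (0);
}
)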
@@ -759,7 +812,10 @@ vm_page_select_free(object, pindex, prefqueue)
int i,j;
int index, hindex;
#endif
- vm_page_t m, mh;
+ vm_page_t m;
+#if 0
+ vm_page_t mh;
+#endif
int oqueuediff;
struct vpgqueues *pq;
@@ -768,6 +824,7 @@ vm_page_select_free(object, pindex, prefqueue)
else
oqueuediff = PQ_ZERO - PQ_FREE;
+#if 0
if (mh = object->page_hint) {
if (mh->pindex == (pindex - 1)) {
if ((mh->flags & PG_FICTITIOUS) == 0) {
@@ -785,6 +842,7 @@ vm_page_select_free(object, pindex, prefqueue)
}
}
}
+#endif
pq = &vm_page_queues[prefqueue];
@@ -857,6 +915,8 @@ vm_page_select_free(object, pindex, prefqueue)
* Additional special handling is required when called from an
* interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with
* the page cache in this case.
+ *
+ * vm_page_alloc()
*/
vm_page_t
vm_page_alloc(object, pindex, page_req)
@@ -864,7 +924,7 @@ vm_page_alloc(object, pindex, page_req)
vm_pindex_t pindex;
int page_req;
{
- register vm_page_t m;
+ register vm_page_t m = NULL;
struct vpgqueues *pq;
vm_object_t oldobject;
int queue, qtype;
@@ -873,12 +933,17 @@ vm_page_alloc(object, pindex, page_req)
KASSERT(!vm_page_lookup(object, pindex),
("vm_page_alloc: page already allocated"));
+ /*
+ * The pager is allowed to eat deeper into the free page list.
+ */
+
if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
page_req = VM_ALLOC_SYSTEM;
};
s = splvm();
+loop:
switch (page_req) {
case VM_ALLOC_NORMAL:
@@ -961,20 +1026,36 @@ vm_page_alloc(object, pindex, page_req)
queue = m->queue;
qtype = queue - m->pc;
- if (qtype == PQ_ZERO)
- vm_page_zero_count--;
+
+ /*
+ * Cache pages must be formally freed (and doubly so with the
+ * new pagerops functions). We free the page and try again.
+ *
+ * This also has the side effect of ensuring that the minfreepage
+ * wall is held more tightly versus the old code.
+ */
+
+ if (qtype == PQ_CACHE) {
+#if !defined(MAX_PERF)
+ if (m->dirty)
+ panic("found dirty cache page %p", m);
+
+#endif
+ vm_page_busy(m);
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ goto loop;
+ }
+
pq = &vm_page_queues[queue];
TAILQ_REMOVE(pq->pl, m, pageq);
(*pq->cnt)--;
(*pq->lcnt)--;
oldobject = NULL;
+
if (qtype == PQ_ZERO) {
+ vm_page_zero_count--;
m->flags = PG_ZERO | PG_BUSY;
- } else if (qtype == PQ_CACHE) {
- oldobject = m->object;
- vm_page_busy(m);
- vm_page_remove(m);
- m->flags = PG_BUSY;
} else {
m->flags = PG_BUSY;
}
@@ -1004,6 +1085,12 @@ vm_page_alloc(object, pindex, page_req)
(cnt.v_free_count < cnt.v_pageout_free_min))
pagedaemon_wakeup();
+#if 0
+ /*
+ * (code removed - was previously a manual breakout of the act of
+ * freeing a page from cache. We now just call vm_page_free() on
+ * a cache page an loop so this code no longer needs to be here)
+ */
if ((qtype == PQ_CACHE) &&
((page_req == VM_ALLOC_NORMAL) || (page_req == VM_ALLOC_ZERO)) &&
oldobject && (oldobject->type == OBJT_VNODE) &&
@@ -1017,6 +1104,7 @@ vm_page_alloc(object, pindex, page_req)
}
}
}
+#endif
splx(s);
return (m);
@@ -1048,6 +1136,33 @@ vm_wait()
}
/*
+ * vm_await: (also see VM_AWAIT macro)
+ *
+ * asleep on an event that will signal when free pages are available
+ * for allocation.
+ */
+
+void
+vm_await()
+{
+ int s;
+
+ s = splvm();
+ if (curproc == pageproc) {
+ vm_pageout_pages_needed = 1;
+ asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
+ } else {
+ if (!vm_pages_needed) {
+ vm_pages_needed++;
+ wakeup(&vm_pages_needed);
+ }
+ asleep(&cnt.v_free_count, PVM, "vmwait", 0);
+ }
+ splx(s);
+}
+
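(The asleep()-based variant registers the wait without blocking, so a failing caller -- e.g. kmem_malloc() returning 0 for M_ASLEEP -- can unwind first and block later at its own await point. A rough mock of that control flow, not the real kernel asleep()/await() API:

#include <stdio.h>

/* mock stand-ins for the kernel's asleep()/await() pair */
static const char *pending_wmesg;

static void
mock_asleep(const char *wmesg)
{
	pending_wmesg = wmesg;		/* register the sleep, do not block */
}

static void
mock_await(void)
{
	if (pending_wmesg != NULL) {
		printf("blocking on \"%s\" now\n", pending_wmesg);
		pending_wmesg = NULL;
	}
}

int
main(void)
{
	/* VM_AWAIT/vm_await() registers the sleep up front ... */
	mock_asleep("vmwait");
	/* ... the caller unwinds, then blocks once it reaches its
	 * await point. */
	mock_await();
	return (0);
}
)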
+#if 0
+/*
* vm_page_sleep:
*
* Block until page is no longer busy.
@@ -1069,6 +1184,38 @@ vm_page_sleep(vm_page_t m, char *msg, char *busy) {
return slept;
}
+#endif
+
+#if 0
+
+/*
+ * vm_page_asleep:
+ *
+ * Similar to vm_page_sleep(), but does not block. Returns 0 if
+ * the page is not busy, or 1 if the page is busy.
+ *
+ * This routine has the side effect of calling asleep() if the page
+ * was busy (1 returned).
+ */
+
+int
+vm_page_asleep(vm_page_t m, char *msg, char *busy) {
+ int slept = 0;
+ if ((busy && *busy) || (m->flags & PG_BUSY)) {
+ int s;
+ s = splvm();
+ if ((busy && *busy) || (m->flags & PG_BUSY)) {
+ vm_page_flag_set(m, PG_WANTED);
+ asleep(m, PVM, msg, 0);
+ slept = 1;
+ }
+ splx(s);
+ }
+ return slept;
+}
+
+#endif
+
/*
* vm_page_activate:
*
@@ -1111,13 +1258,49 @@ vm_page_activate(m)
*
* This routine may not block.
*/
-static int
-vm_page_freechk_and_unqueue(m)
- vm_page_t m;
+static __inline void
+vm_page_free_wakeup()
{
- vm_object_t oldobject;
+ /*
+ * if pageout daemon needs pages, then tell it that there are
+ * some free.
+ */
+ if (vm_pageout_pages_needed) {
+ wakeup(&vm_pageout_pages_needed);
+ vm_pageout_pages_needed = 0;
+ }
+ /*
+ * wakeup processes that are waiting on memory if we hit a
+ * high water mark. And wakeup scheduler process if we have
+ * lots of memory. this process will swapin processes.
+ */
+ if (vm_pages_needed &&
+ ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
+ wakeup(&cnt.v_free_count);
+ vm_pages_needed = 0;
+ }
+}
- oldobject = m->object;
+/*
+ * vm_page_free_toq:
+ *
+ * Returns the given page to the PQ_FREE or PQ_ZERO list,
+ * disassociating it with any VM object.
+ *
+ * Object and page must be locked prior to entry.
+ * This routine may not block.
+ */
+
+void
+vm_page_free_toq(vm_page_t m, int queue)
+{
+ int s;
+ struct vpgqueues *pq;
+ vm_object_t object = m->object;
+
+ s = splvm();
+
+ cnt.v_tfree++;
#if !defined(MAX_PERF)
if (m->busy || ((m->queue - m->pc) == PQ_FREE) ||
@@ -1133,11 +1316,24 @@ vm_page_freechk_and_unqueue(m)
}
#endif
+ /*
+ * unqueue, then remove page. Note that we cannot destroy
+ * the page here because we do not want to call the pager's
+ * callback routine until after we've put the page on the
+ * appropriate free queue.
+ */
+
vm_page_unqueue_nowakeup(m);
vm_page_remove(m);
+ /*
+ * If fictitious remove object association and
+ * return, otherwise delay object association removal.
+ */
+
if ((m->flags & PG_FICTITIOUS) != 0) {
- return 0;
+ splx(s);
+ return;
}
m->valid = 0;
@@ -1156,10 +1352,17 @@ vm_page_freechk_and_unqueue(m)
cnt.v_wire_count--;
}
- if (oldobject && (oldobject->type == OBJT_VNODE) &&
- ((oldobject->flags & OBJ_DEAD) == 0)) {
- struct vnode *vp;
- vp = (struct vnode *) oldobject->handle;
+ /*
+ * If we've exhausted the object's resident pages we want to free
+ * it up.
+ */
+
+ if (object &&
+ (object->type == OBJT_VNODE) &&
+ ((object->flags & OBJ_DEAD) == 0)
+ ) {
+ struct vnode *vp = (struct vnode *)object->handle;
+
if (vp && VSHOULDFREE(vp)) {
if ((vp->v_flag & (VTBFREE|VDOOMED|VFREE)) == 0) {
TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist);
@@ -1172,107 +1375,31 @@ vm_page_freechk_and_unqueue(m)
pmap_page_is_free(m);
#endif
- return 1;
-}
-
-/*
- * helper routine for vm_page_free and vm_page_free_zero.
- *
- * This routine may not block.
- */
-static __inline void
-vm_page_free_wakeup()
-{
-
-/*
- * if pageout daemon needs pages, then tell it that there are
- * some free.
- */
- if (vm_pageout_pages_needed) {
- wakeup(&vm_pageout_pages_needed);
- vm_pageout_pages_needed = 0;
- }
- /*
- * wakeup processes that are waiting on memory if we hit a
- * high water mark. And wakeup scheduler process if we have
- * lots of memory. this process will swapin processes.
- */
- if (vm_pages_needed &&
- ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
- wakeup(&cnt.v_free_count);
- vm_pages_needed = 0;
- }
-}
-
-/*
- * vm_page_free:
- *
- * Returns the given page to the free list,
- * disassociating it with any VM object.
- *
- * Object and page must be locked prior to entry.
- * This routine may not block.
- */
-void
-vm_page_free(m)
- register vm_page_t m;
-{
- int s;
- struct vpgqueues *pq;
-
- s = splvm();
-
- cnt.v_tfree++;
-
- if (!vm_page_freechk_and_unqueue(m)) {
- splx(s);
- return;
- }
-
- m->queue = PQ_FREE + m->pc;
+ m->queue = queue + m->pc;
pq = &vm_page_queues[m->queue];
++(*pq->lcnt);
++(*pq->cnt);
- /*
- * If the pageout process is grabbing the page, it is likely
- * that the page is NOT in the cache. It is more likely that
- * the page will be partially in the cache if it is being
- * explicitly freed.
- */
- if (curproc == pageproc) {
- TAILQ_INSERT_TAIL(pq->pl, m, pageq);
- } else {
- TAILQ_INSERT_HEAD(pq->pl, m, pageq);
- }
- vm_page_free_wakeup();
- splx(s);
-}
-
-void
-vm_page_free_zero(m)
- register vm_page_t m;
-{
- int s;
- struct vpgqueues *pq;
-
- s = splvm();
-
- cnt.v_tfree++;
+ if (queue == PQ_ZERO) {
+ TAILQ_INSERT_HEAD(pq->pl, m, pageq);
+ ++vm_page_zero_count;
+ } else {
+ /*
+ * If the pageout process is grabbing the page, it is likely
+ * that the page is NOT in the cache. It is more likely that
+ * the page will be partially in the cache if it is being
+ * explicitly freed.
+ */
- if (!vm_page_freechk_and_unqueue(m)) {
- splx(s);
- return;
+ if (curproc == pageproc) {
+ TAILQ_INSERT_TAIL(pq->pl, m, pageq);
+ } else {
+ TAILQ_INSERT_HEAD(pq->pl, m, pageq);
+ }
}
- m->queue = PQ_ZERO + m->pc;
- pq = &vm_page_queues[m->queue];
- ++(*pq->lcnt);
- ++(*pq->cnt);
-
- TAILQ_INSERT_HEAD(pq->pl, m, pageq);
- ++vm_page_zero_count;
vm_page_free_wakeup();
+
splx(s);
}
@@ -1311,6 +1438,17 @@ vm_page_wire(m)
* Release one wiring of this page, potentially
* enabling it to be paged again.
*
+ * Many pages placed on the inactive queue should actually go
+ * into the cache, but it is difficult to figure out which. What
+ * we do instead, if the inactive target is well met, is to put
+ * clean pages at the head of the inactive queue instead of the tail.
+ * This will cause them to be moved to the cache more quickly and
+ * if not actively re-referenced, freed more quickly. If we just
+ * stick these pages at the end of the inactive queue, heavy filesystem
+ * meta-data accesses can cause an unnecessary paging load on memory bound
+ * processes. This optimization causes one-time-use metadata to be
+ * reused more quickly.
+ *
* The page queues must be locked.
* This routine may not block.
*/
@@ -1351,7 +1489,8 @@ vm_page_unwire(m, activate)
/*
- * Move the specified page to the inactive queue.
+ * Move the specified page to the inactive queue. If the page has
+ * any associated swap, the swap is deallocated.
*
* This routine may not block.
*/
@@ -1383,7 +1522,8 @@ vm_page_deactivate(m)
/*
* vm_page_cache
*
- * Put the specified page onto the page cache queue (if appropriate).
+ * Put the specified page onto the page cache queue (if appropriate).
+ *
* This routine may not block.
*/
void
@@ -1624,7 +1764,7 @@ again1:
}
next = TAILQ_NEXT(m, pageq);
- if (vm_page_sleep(m, "vpctw0", &m->busy))
+ if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
goto again1;
vm_page_test_dirty(m);
if (m->dirty) {
@@ -1652,7 +1792,7 @@ again1:
}
next = TAILQ_NEXT(m, pageq);
- if (vm_page_sleep(m, "vpctw1", &m->busy))
+ if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
goto again1;
vm_page_test_dirty(m);
if (m->dirty) {
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 3149391..f9e4926 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_page.h,v 1.48 1998/10/28 13:37:02 dg Exp $
+ * $Id: vm_page.h,v 1.49 1999/01/08 17:31:28 eivind Exp $
*/
/*
@@ -105,10 +105,10 @@ TAILQ_HEAD(pglist, vm_page);
struct vm_page {
TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */
- TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */
- TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
+ struct vm_page *hnext; /* hash table link (O,P) */
+ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
- vm_object_t object; /* which object am I in (O,P) */
+ vm_object_t object; /* which object am I in (O,P)*/
vm_pindex_t pindex; /* offset into object (O,P) */
vm_offset_t phys_addr; /* physical address of page */
u_short queue; /* page queue index */
@@ -130,6 +130,13 @@ struct vm_page {
};
/*
+ * note SWAPBLK_NONE is a flag, basically the high bit.
+ */
+
+#define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */
+#define SWAPBLK_NONE ((daddr_t)((u_daddr_t)SWAPBLK_MASK + 1))/* flag */
+
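(A quick self-contained check of the flag arithmetic, using fixed 32-bit stand-ins for daddr_t/u_daddr_t; the real widths are whatever the kernel defines:

#include <stdint.h>
#include <stdio.h>

typedef int32_t  blk_t;		/* stand-in for daddr_t */
typedef uint32_t ublk_t;	/* stand-in for u_daddr_t */

#define SWAPBLK_MASK ((blk_t)((ublk_t)-1 >> 1))			/* 0x7fffffff */
#define SWAPBLK_NONE ((blk_t)((ublk_t)SWAPBLK_MASK + 1))	/* the high bit */

int
main(void)
{
	blk_t blk = SWAPBLK_NONE;	/* "no swap assigned" sentinel */

	printf("mask %#x  none %#x\n",
	    (unsigned)SWAPBLK_MASK, (unsigned)SWAPBLK_NONE);
	printf("swap assigned? %s\n", (blk & SWAPBLK_NONE) ? "no" : "yes");
	return (0);
}
)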
+/*
* Page coloring parameters
*/
/* Each of PQ_FREE, PQ_ZERO and PQ_CACHE have PQ_HASH_SIZE entries */
@@ -201,14 +208,15 @@ extern struct vpgqueues {
*
* Note: PG_FILLED and PG_DIRTY are added for the filesystems.
*/
-#define PG_BUSY 0x01 /* page is in transit (O) */
-#define PG_WANTED 0x02 /* someone is waiting for page (O) */
-#define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */
-#define PG_WRITEABLE 0x10 /* page is mapped writeable */
-#define PG_MAPPED 0x20 /* page is mapped */
-#define PG_ZERO 0x40 /* page is zeroed */
-#define PG_REFERENCED 0x80 /* page has been referenced */
-#define PG_CLEANCHK 0x100 /* page will be checked for cleaning */
+#define PG_BUSY 0x0001 /* page is in transit (O) */
+#define PG_WANTED 0x0002 /* someone is waiting for page (O) */
+#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */
+#define PG_WRITEABLE 0x0010 /* page is mapped writeable */
+#define PG_MAPPED 0x0020 /* page is mapped */
+#define PG_ZERO 0x0040 /* page is zeroed */
+#define PG_REFERENCED 0x0080 /* page has been referenced */
+#define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */
+#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
/*
* Misc constants.
@@ -307,16 +315,36 @@ vm_page_busy(vm_page_t m)
vm_page_flag_set(m, PG_BUSY);
}
+/*
+ * vm_page_flash:
+ *
+ * wakeup anyone waiting for the page.
+ */
+
static __inline void
-vm_page_wakeup(vm_page_t m)
+vm_page_flash(vm_page_t m)
{
- vm_page_flag_clear(m, PG_BUSY);
if (m->flags & PG_WANTED) {
vm_page_flag_clear(m, PG_WANTED);
wakeup(m);
}
}
+/*
+ * vm_page_wakeup:
+ *
+ * clear the PG_BUSY flag and wakeup anyone waiting for the
+ * page.
+ *
+ */
+
+static __inline void
+vm_page_wakeup(vm_page_t m)
+{
+ vm_page_flag_clear(m, PG_BUSY);
+ vm_page_flash(m);
+}
+
static __inline void
vm_page_io_start(vm_page_t m)
{
@@ -327,10 +355,8 @@ static __inline void
vm_page_io_finish(vm_page_t m)
{
atomic_subtract_char(&m->busy, 1);
- if ((m->flags & PG_WANTED) && m->busy == 0) {
- vm_page_flag_clear(m, PG_WANTED);
- wakeup(m);
- }
+ if (m->busy == 0)
+ vm_page_flash(m);
}
@@ -353,12 +379,13 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
+static __inline void vm_page_free __P((vm_page_t));
+static __inline void vm_page_free_zero __P((vm_page_t));
+void vm_page_destroy __P((vm_page_t));
void vm_page_deactivate __P((vm_page_t));
-void vm_page_free __P((vm_page_t));
-void vm_page_free_zero __P((vm_page_t));
void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t));
vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t));
-void vm_page_remove __P((vm_page_t));
+vm_object_t vm_page_remove __P((vm_page_t));
void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t));
vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t));
void vm_page_unwire __P((vm_page_t, int));
@@ -374,7 +401,11 @@ int vm_page_bits __P((int, int));
vm_page_t vm_page_list_find __P((int, int));
int vm_page_queue_index __P((vm_offset_t, int));
vm_page_t vm_page_select __P((vm_object_t, vm_pindex_t, int));
+#if 0
int vm_page_sleep(vm_page_t m, char *msg, char *busy);
+int vm_page_asleep(vm_page_t m, char *msg, char *busy);
+#endif
+void vm_page_free_toq(vm_page_t m, int queue);
/*
* Keep page from being freed by the page daemon
@@ -438,5 +469,64 @@ vm_page_copy(src_m, dest_m)
dest_m->valid = VM_PAGE_BITS_ALL;
}
+/*
+ * vm_page_free:
+ *
+ * Free a page
+ */
+static __inline void
+vm_page_free(m)
+ vm_page_t m;
+{
+ vm_page_free_toq(m, PQ_FREE);
+}
+
+/*
+ * vm_page_free_zero:
+ *
+ * Free a page to the zeroed-pages queue
+ */
+static __inline void
+vm_page_free_zero(m)
+ vm_page_t m;
+{
+ vm_page_free_toq(m, PQ_ZERO);
+}
+
+/*
+ * vm_page_sleep_busy:
+ *
+ * Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE)
+ * m->busy is zero. Returns TRUE if it had to sleep (including if
+ * it almost had to sleep and made temporary spl*() mods), FALSE
+ * otherwise.
+ *
+ * This routine assumes that interrupts can only remove the busy
+ * status from a page, not set the busy status or change it from
+ * PG_BUSY to m->busy or vice versa (which would create a timing
+ * window).
+ *
+ * Note that being an inline, this code will be well optimized.
+ */
+
+static __inline int
+vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
+{
+ if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
+ int s = splvm();
+ if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
+ /*
+ * Page is busy. Wait and retry.
+ */
+ vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
+ tsleep(m, PVM, msg, 0);
+ }
+ splx(s);
+ return(TRUE);
+ /* not reached */
+ }
+ return(FALSE);
+}
+
#endif /* KERNEL */
#endif /* !_VM_PAGE_ */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 606981f..06f24d6 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -65,7 +65,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pageout.c,v 1.128 1998/10/25 17:44:59 phk Exp $
+ * $Id: vm_pageout.c,v 1.129 1998/10/31 17:21:31 peter Exp $
*/
/*
@@ -211,13 +211,10 @@ void pmap_collect(void);
* Clean the page and remove it from the laundry.
*
* We set the busy bit to cause potential page faults on this page to
- * block.
- *
- * And we set pageout-in-progress to keep the object from disappearing
- * during pageout. This guarantees that the page won't move from the
- * inactive queue. (However, any other page on the inactive queue may
- * move!)
+ * block. Note the careful timing, however: the busy bit isn't set until
+ * late, and until it is we cannot do anything that will mess with the page.
*/
+
static int
vm_pageout_clean(m)
vm_page_t m;
@@ -231,12 +228,23 @@ vm_pageout_clean(m)
object = m->object;
/*
+ * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
+ * with the new swapper, but we could have serious problems paging
+ * out other object types if there is insufficient memory.
+ *
+ * Unfortunately, checking free memory here is far too late, so the
+ * check has been moved up a procedural level.
+ */
+
+#if 0
+ /*
* If not OBJT_SWAP, additional memory may be needed to do the pageout.
* Try to avoid the deadlock.
*/
if ((object->type == OBJT_DEFAULT) &&
((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min))
return 0;
+#endif
/*
* Don't mess with the page if it's busy.
@@ -245,12 +253,21 @@ vm_pageout_clean(m)
((m->busy != 0) || (m->flags & PG_BUSY)))
return 0;
+#if 0
+ /*
+ * XXX REMOVED XXX. vm_object_collapse() can block, which can
+ * change the page state. Calling vm_object_collapse() might also
+ * destroy or rename the page because we have not busied it yet!!!
+ * So this code segment is removed.
+ */
/*
- * Try collapsing before it's too late.
+ * Try collapsing before it's too late. XXX huh? Why are we doing
+ * this here?
*/
if (object->backing_object) {
vm_object_collapse(object);
}
+#endif
mc[vm_pageout_page_count] = m;
pageout_count = 1;
@@ -351,6 +368,16 @@ do_backward:
return vm_pageout_flush(&mc[page_base], pageout_count, 0);
}
+/*
+ * vm_pageout_flush() - launder the given pages
+ *
+ * The given pages are laundered. Note that we setup for the start of
+ * I/O ( i.e. busy the page ), mark it read-only, and bump the object
+ * reference count all in here rather then in the parent. If we want
+ * the parent to do more sophisticated things we may have to change
+ * the ordering.
+ */
+
int
vm_pageout_flush(mc, count, flags)
vm_page_t *mc;
@@ -362,6 +389,14 @@ vm_pageout_flush(mc, count, flags)
int numpagedout = 0;
int i;
+ /*
+ * Initiate I/O. Bump the vm_page_t->busy counter and
+ * mark the pages read-only.
+ *
+ * We do not have to fixup the clean/dirty bits here... we can
+ * allow the pager to do it after the I/O completes.
+ */
+
for (i = 0; i < count; i++) {
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
@@ -585,25 +620,24 @@ vm_pageout_map_deactivate_pages(map, desired)
}
#endif
+/*
+ * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
+ * to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects
+ * which we know can be trivially freed.
+ */
+
void
vm_pageout_page_free(vm_page_t m) {
- struct vnode *vp;
- vm_object_t object;
-
- object = m->object;
- object->ref_count++;
-
- if (object->type == OBJT_VNODE) {
- vp = object->handle;
- vp->v_usecount++;
- if (VSHOULDBUSY(vp))
- vbusy(vp);
- }
+ vm_object_t object = m->object;
+ int type = object->type;
+ if (type == OBJT_SWAP || type == OBJT_DEFAULT)
+ vm_object_reference(object);
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
- vm_object_deallocate(object);
+ if (type == OBJT_SWAP || type == OBJT_DEFAULT)
+ vm_object_deallocate(object);
}
/*
@@ -613,9 +647,10 @@ static int
vm_pageout_scan()
{
vm_page_t m, next;
- int page_shortage, addl_page_shortage, maxscan, pcount;
+ int page_shortage, maxscan, pcount;
+ int addl_page_shortage, addl_page_shortage_init;
int maxlaunder;
- int pages_freed;
+ int launder_loop = 0;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
@@ -629,31 +664,53 @@ vm_pageout_scan()
*/
pmap_collect();
- /*
- * Start scanning the inactive queue for pages we can free. We keep
- * scanning until we have enough free pages or we have scanned through
- * the entire queue. If we encounter dirty pages, we start cleaning
- * them.
- */
-
- pages_freed = 0;
- addl_page_shortage = vm_pageout_deficit;
+ addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
if (max_page_launder == 0)
max_page_launder = 1;
- maxlaunder = (cnt.v_inactive_target > max_page_launder) ?
- max_page_launder : cnt.v_inactive_target;
-rescan0:
- maxscan = cnt.v_inactive_count;
- for( m = TAILQ_FIRST(&vm_page_queue_inactive);
+ /*
+ * Calculate the number of pages we want to either free or move
+ * to the cache.
+ */
+
+ page_shortage = (cnt.v_free_target + cnt.v_cache_min) -
+ (cnt.v_free_count + cnt.v_cache_count);
+ page_shortage += addl_page_shortage_init;
+
+ /*
+ * Figure out what to do with dirty pages when they are encountered.
+ * Assume that 1/3 of the pages on the inactive list are clean. If
+ * we think we can reach our target, disable laundering (do not
+ * clean any dirty pages). If we miss the target we will loop back
+ * up and do a laundering run.
+ */
- (m != NULL) && (maxscan-- > 0) &&
- ((cnt.v_cache_count + cnt.v_free_count) <
- (cnt.v_cache_min + cnt.v_free_target));
+ if (cnt.v_inactive_count / 3 > page_shortage) {
+ maxlaunder = 0;
+ launder_loop = 0;
+ } else {
+ maxlaunder =
+ (cnt.v_inactive_target > max_page_launder) ?
+ max_page_launder : cnt.v_inactive_target;
+ launder_loop = 1;
+ }
- m = next) {
+ /*
+ * Start scanning the inactive queue for pages we can move to the
+ * cache or free. The scan will stop when the target is reached or
+ * we have scanned the entire inactive queue.
+ */
+
+rescan0:
+ addl_page_shortage = addl_page_shortage_init;
+ maxscan = cnt.v_inactive_count;
+ for (
+ m = TAILQ_FIRST(&vm_page_queue_inactive);
+ m != NULL && maxscan-- > 0 && page_shortage > 0;
+ m = next
+ ) {
cnt.v_pdpages++;
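(A worked example of the new target arithmetic from the hunk above, with invented counter values:

#include <stdio.h>

int
main(void)
{
	/* invented counters, in pages */
	int free_target = 4096, cache_min = 2048;
	int free_count = 1500, cache_count = 1000;
	int inactive_count = 12000, deficit = 0;

	int page_shortage = (free_target + cache_min) -
	    (free_count + cache_count) + deficit;	/* = 3644 */

	/* launder only if the assumed-clean third of the inactive
	 * queue cannot cover the shortage (reduced to a yes/no here;
	 * the kernel computes an actual maxlaunder page count) */
	int launder = (inactive_count / 3 > page_shortage) ? 0 : 1;

	printf("shortage %d pages, laundering %s\n",
	    page_shortage, launder ? "enabled" : "disabled");
	return (0);
}
)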
@@ -681,19 +738,21 @@ rescan0:
}
/*
- * If the object is not being used, we ignore previous references.
+ * If the object is not being used, we ignore previous
+ * references.
*/
if (m->object->ref_count == 0) {
vm_page_flag_clear(m, PG_REFERENCED);
pmap_clear_reference(VM_PAGE_TO_PHYS(m));
/*
- * Otherwise, if the page has been referenced while in the inactive
- * queue, we bump the "activation count" upwards, making it less
- * likely that the page will be added back to the inactive queue
- * prematurely again. Here we check the page tables (or emulated
- * bits, if any), given the upper level VM system not knowing anything
- * about existing references.
+ * Otherwise, if the page has been referenced while in the
+ * inactive queue, we bump the "activation count" upwards,
+ * making it less likely that the page will be added back to
+ * the inactive queue prematurely again. Here we check the
+ * page tables (or emulated bits, if any), given the upper
+ * level VM system not knowing anything about existing
+ * references.
*/
} else if (((m->flags & PG_REFERENCED) == 0) &&
(actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)))) {
@@ -703,10 +762,10 @@ rescan0:
}
/*
- * If the upper level VM system knows about any page references,
- * we activate the page. We also set the "activation count" higher
- * than normal so that we will less likely place pages back onto the
- * inactive queue again.
+ * If the upper level VM system knows about any page
+ * references, we activate the page. We also set the
+ * "activation count" higher than normal so that we will less
+ * likely place pages back onto the inactive queue again.
*/
if ((m->flags & PG_REFERENCED) != 0) {
vm_page_flag_clear(m, PG_REFERENCED);
@@ -717,9 +776,10 @@ rescan0:
}
/*
- * If the upper level VM system doesn't know anything about the
- * page being dirty, we have to check for it again. As far as the
- * VM code knows, any partially dirty pages are fully dirty.
+ * If the upper level VM system doesn't know anything about
+ * the page being dirty, we have to check for it again. As
+ * far as the VM code knows, any partially dirty pages are
+ * fully dirty.
*/
if (m->dirty == 0) {
vm_page_test_dirty(m);
@@ -733,14 +793,14 @@ rescan0:
if (m->valid == 0) {
vm_pageout_page_free(m);
cnt.v_dfree++;
- pages_freed++;
+ --page_shortage;
/*
* Clean pages can be placed onto the cache queue.
*/
} else if (m->dirty == 0) {
vm_page_cache(m);
- pages_freed++;
+ --page_shortage;
/*
* Dirty pages need to be paged out. Note that we clean
@@ -763,8 +823,8 @@ rescan0:
}
/*
- * We don't bother paging objects that are "dead". Those
- * objects are in a "rundown" state.
+ * We don't bother paging objects that are "dead".
+ * Those objects are in a "rundown" state.
*/
if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
s = splvm();
@@ -774,10 +834,61 @@ rescan0:
continue;
}
- if ((object->type == OBJT_VNODE) &&
- (object->flags & OBJ_DEAD) == 0) {
+ /*
+ * For now we protect against potential memory
+ * deadlocks by requiring significant memory to be
+ * free if the object is not OBJT_DEFAULT or OBJT_SWAP.
+ * We do not 'trust' any other object type to operate
+ * with low memory, not even OBJT_DEVICE. The VM
+ * allocator will special case allocations done by
+ * the pageout daemon so the check below actually
+ * does have some hysteresis in it. It isn't the best
+ * solution, though.
+ */
+
+ if (
+ object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP &&
+ cnt.v_free_count < cnt.v_free_reserved
+ ) {
+ s = splvm();
+ TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq);
+ TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
+ splx(s);
+ continue;
+ }
+
+ /*
+ * Presumably we have sufficient free memory to do
+ * the more sophisticated checks and locking required
+ * for vnodes.
+ *
+ * The object is already known NOT to be dead. The
+ * vget() may still block, though, because
+ * VOP_ISLOCKED() doesn't check to see if an inode
+ * (v_data) is associated with the vnode. If it isn't,
+ * vget() will load it in from disk. Worse, vget()
+ * may actually get stuck waiting on "inode" if another
+ * process is in the process of bringing the inode in.
+ * This is bad news for us either way.
+ *
+ * So for the moment we check v_data == NULL as a
+ * workaround. This means that vnodes which do not
+ * use v_data in the way we expect probably will not
+ * wind up being paged out by the pager and it will be
+ * up to the syncer to get them. That's better than
+ * us blocking here.
+ *
+ * This whole code section is bogus - we need to fix
+ * the vnode pager to handle vm_page_t's without us
+ * having to do any sophisticated VOP tests.
+ */
+
+ if (object->type == OBJT_VNODE) {
vp = object->handle;
+
if (VOP_ISLOCKED(vp) ||
+ vp->v_data == NULL ||
vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
if ((m->queue == PQ_INACTIVE) &&
(m->hold_count == 0) &&
@@ -844,19 +955,34 @@ rescan0:
}
/*
- * Compute the page shortage. If we are still very low on memory be
- * sure that we will move a minimal amount of pages from active to
- * inactive.
+ * If we still have a page shortage and we didn't launder anything,
+ * run the inactive scan again and launder something this time.
+ */
+
+ if (launder_loop == 0 && page_shortage > 0) {
+ launder_loop = 1;
+ maxlaunder =
+ (cnt.v_inactive_target > max_page_launder) ?
+ max_page_launder : cnt.v_inactive_target;
+ goto rescan0;
+ }
+
+ /*
+ * Compute the page shortage from the point of view of having to
+ * move pages from the active queue to the inactive queue.
*/
+
page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
page_shortage += addl_page_shortage;
- if (page_shortage <= 0) {
- page_shortage = 0;
- }
+
+ /*
+ * Scan the active queue for things we can deactivate
+ */
pcount = cnt.v_active_count;
m = TAILQ_FIRST(&vm_page_queue_active);
+
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
/*
@@ -943,10 +1069,14 @@ rescan0:
}
s = splvm();
+
/*
* We try to maintain some *really* free pages, this allows interrupt
- * code to be guaranteed space.
+ * code to be guaranteed space. Since both cache and free queues
+ * are considered basically 'free', moving pages from cache to free
+ * does not affect other calculations.
*/
+
while (cnt.v_free_count < cnt.v_free_reserved) {
static int cache_rover = 0;
m = vm_page_list_find(PQ_CACHE, cache_rover);
@@ -995,7 +1125,6 @@ rescan0:
#endif
}
-
/*
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
@@ -1242,10 +1371,8 @@ vm_pageout()
cnt.v_pdwakeups++;
vm_pages_needed = 0;
splx(s);
- vm_pager_sync();
vm_pageout_scan();
vm_pageout_deficit = 0;
- vm_pager_sync();
wakeup(&cnt.v_free_count);
}
}
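
The hunks above restructure vm_pageout_scan() around two quantities: page_shortage, which the inactive scan decrements as it frees or caches pages, and launder_loop, which triggers a second pass whose laundering is clamped to max_page_launder. The small userland program below only restates the two pieces of arithmetic shown in the diff so they can be checked in isolation; the struct, the sample field values, and the addl_page_shortage figure are invented for the illustration and are not kernel code.

#include <stdio.h>

struct vmmeter_model {			/* illustrative stand-in for "cnt" */
	int v_free_count, v_inactive_count, v_cache_count;
	int v_cache_min, v_inactive_target;
};

int
main(void)
{
	struct vmmeter_model cnt = {
		.v_free_count = 500, .v_inactive_count = 2000,
		.v_cache_count = 300, .v_cache_min = 256,
		.v_inactive_target = 4096
	};
	int max_page_launder = 32;	/* tunable named in the diff; value made up */
	int addl_page_shortage = 10;	/* e.g. busy/held pages skipped by the scan */
	int page_shortage, maxlaunder;

	/*
	 * Second-pass clamp from the diff: never launder more than
	 * max_page_launder pages, and never more than the inactive target.
	 */
	maxlaunder = (cnt.v_inactive_target > max_page_launder) ?
	    max_page_launder : cnt.v_inactive_target;

	/*
	 * Active -> inactive shortage, the formula from the diff: how many
	 * pages must be deactivated so that free + inactive + cache reaches
	 * the inactive target plus the cache minimum.
	 */
	page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) -
	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
	page_shortage += addl_page_shortage;

	printf("maxlaunder=%d page_shortage=%d\n", maxlaunder, page_shortage);
	return (0);
}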
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index a864896..68c0561 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pageout.h,v 1.22 1998/01/12 01:44:46 dyson Exp $
+ * $Id: vm_pageout.h,v 1.23 1998/01/22 17:30:43 dyson Exp $
*/
#ifndef _VM_VM_PAGEOUT_H_
@@ -100,7 +100,9 @@ extern int vm_pageout_deficit;
extern void pagedaemon_wakeup __P((void));
#define VM_WAIT vm_wait()
+#define VM_AWAIT vm_await()
extern void vm_wait __P((void));
+extern void vm_await __P((void));
#ifdef KERNEL
void vm_pageout_page __P((vm_page_t, vm_object_t));
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
index 18df05d..62fe6e8 100644
--- a/sys/vm/vm_pager.c
+++ b/sys/vm/vm_pager.c
@@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $Id: vm_pager.c,v 1.39 1998/10/31 15:31:29 peter Exp $
+ * $Id: vm_pager.c,v 1.40 1998/11/10 09:16:27 peter Exp $
*/
/*
@@ -91,6 +91,8 @@ extern struct pagerops swappagerops;
extern struct pagerops vnodepagerops;
extern struct pagerops devicepagerops;
+int cluster_pbuf_freecnt = -1; /* unlimited to begin with */
+
static int dead_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static vm_object_t dead_pager_alloc __P((void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t));
@@ -164,14 +166,15 @@ struct pagerops deadpagerops = {
NULL
};
-static struct pagerops *pagertab[] = {
+struct pagerops *pagertab[] = {
&defaultpagerops, /* OBJT_DEFAULT */
&swappagerops, /* OBJT_SWAP */
&vnodepagerops, /* OBJT_VNODE */
&devicepagerops, /* OBJT_DEVICE */
&deadpagerops /* OBJT_DEAD */
};
-static int npagers = sizeof(pagertab) / sizeof(pagertab[0]);
+
+int npagers = sizeof(pagertab) / sizeof(pagertab[0]);
/*
* Kernel address space for mapping pages.
@@ -217,6 +220,8 @@ vm_pager_bufferinit()
bp->b_xflags = 0;
}
+ cluster_pbuf_freecnt = nswbuf / 2;
+
swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS);
if (!swapbkva)
panic("Not enough pager_map VM space for physical buffers");
@@ -246,41 +251,21 @@ vm_pager_deallocate(object)
(*pagertab[object->type]->pgo_dealloc) (object);
}
+/*
+ * vm_pager_get_pages() - inline, see vm/vm_pager.h
+ * vm_pager_put_pages() - inline, see vm/vm_pager.h
+ * vm_pager_has_page() - inline, see vm/vm_pager.h
+ * vm_pager_page_inserted() - inline, see vm/vm_pager.h
+ * vm_pager_page_removed() - inline, see vm/vm_pager.h
+ */
-int
-vm_pager_get_pages(object, m, count, reqpage)
- vm_object_t object;
- vm_page_t *m;
- int count;
- int reqpage;
-{
- return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
-}
-
-int
-vm_pager_put_pages(object, m, count, flags, rtvals)
- vm_object_t object;
- vm_page_t *m;
- int count;
- int flags;
- int *rtvals;
-{
- return ((*pagertab[object->type]->pgo_putpages)(object, m, count, flags, rtvals));
-}
-
-boolean_t
-vm_pager_has_page(object, offset, before, after)
- vm_object_t object;
- vm_pindex_t offset;
- int *before;
- int *after;
-{
- return ((*pagertab[object->type]->pgo_haspage) (object, offset, before, after));
-}
-
+#if 0
/*
- * Called by pageout daemon before going back to sleep.
- * Gives pagers a chance to clean up any completed async pageing operations.
+ * vm_pager_sync:
+ *
+ * Called by pageout daemon before going back to sleep.
+ * Gives pagers a chance to clean up any completed async paging
+ * operations.
*/
void
vm_pager_sync()
@@ -292,6 +277,8 @@ vm_pager_sync()
(*(*pgops)->pgo_sync) ();
}
+#endif
+
vm_offset_t
vm_pager_map_page(m)
vm_page_t m;
@@ -342,20 +329,42 @@ initpbuf(struct buf *bp) {
/*
* allocate a physical buffer
+ *
+ * There are a limited number (nswbuf) of physical buffers. We need
+ * to make sure that no single subsystem is able to hog all of them,
+ * so each subsystem implements a counter which is typically initialized
+ * to 1/2 nswbuf. getpbuf() decrements this counter on allocation and
+ * increments it on release, and blocks if the counter hits zero. A
+ * subsystem may initialize the counter to -1 to disable the feature,
+ * but it must still be sure to match up all uses of getpbuf() with
+ * relpbuf() using the same variable.
+ *
+ * NOTE: pfreecnt can be NULL, but this 'feature' will be removed
+ * relatively soon when the rest of the subsystems get smart about it. XXX
*/
struct buf *
-getpbuf()
+getpbuf(pfreecnt)
+ int *pfreecnt;
{
int s;
struct buf *bp;
s = splvm();
+
+ if (pfreecnt) {
+ while (*pfreecnt == 0) {
+ tsleep(pfreecnt, PVM, "wswbuf0", 0);
+ }
+ }
+
/* get a bp from the swap buffer header pool */
while ((bp = TAILQ_FIRST(&bswlist)) == NULL) {
bswneeded = 1;
- tsleep(&bswneeded, PVM, "wswbuf", 0);
+ tsleep(&bswneeded, PVM, "wswbuf1", 0);
}
TAILQ_REMOVE(&bswlist, bp, b_freelist);
+ if (pfreecnt)
+ --*pfreecnt;
splx(s);
initpbuf(bp);
@@ -363,20 +372,27 @@ getpbuf()
}
/*
- * allocate a physical buffer, if one is available
+ * allocate a physical buffer, if one is available.
+ *
+ * Note that there is no NULL hack here - all subsystems using this
+ * call understand how to use pfreecnt.
*/
struct buf *
-trypbuf()
+trypbuf(pfreecnt)
+ int *pfreecnt;
{
int s;
struct buf *bp;
s = splvm();
- if ((bp = TAILQ_FIRST(&bswlist)) == NULL) {
+ if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) {
splx(s);
return NULL;
}
TAILQ_REMOVE(&bswlist, bp, b_freelist);
+
+ --*pfreecnt;
+
splx(s);
initpbuf(bp);
@@ -386,10 +402,14 @@ trypbuf()
/*
* release a physical buffer
+ *
+ * NOTE: pfreecnt can be NULL, but this 'feature' will be removed
+ * relatively soon when the rest of the subsystems get smart about it. XXX
*/
void
-relpbuf(bp)
+relpbuf(bp, pfreecnt)
struct buf *bp;
+ int *pfreecnt;
{
int s;
@@ -403,6 +423,7 @@ relpbuf(bp)
crfree(bp->b_wcred);
bp->b_wcred = NOCRED;
}
+
if (bp->b_vp)
pbrelvp(bp);
@@ -415,5 +436,9 @@ relpbuf(bp)
bswneeded = 0;
wakeup(&bswneeded);
}
+ if (pfreecnt) {
+ if (++*pfreecnt == 1)
+ wakeup(pfreecnt);
+ }
splx(s);
}
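
The comment on getpbuf() above describes a per-subsystem quota: a counter, typically started at nswbuf / 2, that getpbuf() decrements (sleeping when it reaches zero) and relpbuf() increments (waking any waiter). The fragment below sketches the intended calling pattern under that convention; it is modeled on the vnode pager changes later in this commit, and the names foo_pbuf_freecnt, foo_init(), and foo_io(), as well as the elided buffer setup, are hypothetical.

static int foo_pbuf_freecnt = -1;	/* -1: quota not configured yet */

static void
foo_init(void)
{
	/* Claim at most half of the pbuf pool for this subsystem. */
	if (foo_pbuf_freecnt < 0)
		foo_pbuf_freecnt = nswbuf / 2;
}

static void
foo_io(void)
{
	struct buf *bp;

	/* May sleep in "wswbuf0" once this subsystem's quota is exhausted. */
	bp = getpbuf(&foo_pbuf_freecnt);

	/* ... fill in the buffer header and perform the transfer ... */

	/* Return the pbuf; wakes a quota waiter if the count was zero. */
	relpbuf(bp, &foo_pbuf_freecnt);
}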
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
index 6b8eb42..0e8d894 100644
--- a/sys/vm/vm_pager.h
+++ b/sys/vm/vm_pager.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
- * $Id: vm_pager.h,v 1.16 1998/03/07 21:37:27 dyson Exp $
+ * $Id: vm_pager.h,v 1.17 1998/10/13 08:24:44 dg Exp $
*/
/*
@@ -57,7 +57,7 @@ struct pagerops {
int (*pgo_getpages) __P((vm_object_t, vm_page_t *, int, int)); /* Get (read) page. */
int (*pgo_putpages) __P((vm_object_t, vm_page_t *, int, int, int *)); /* Put (write) page. */
boolean_t (*pgo_haspage) __P((vm_object_t, vm_pindex_t, int *, int *)); /* Does pager have page? */
- void (*pgo_sync) __P((void));
+ void (*pgo_pageunswapped) __P((vm_page_t));
};
/*
@@ -87,20 +87,69 @@ MALLOC_DECLARE(M_VMPGDATA);
extern vm_map_t pager_map;
extern int pager_map_size;
+extern struct pagerops *pagertab[];
vm_object_t vm_pager_allocate __P((objtype_t, void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t));
void vm_pager_bufferinit __P((void));
void vm_pager_deallocate __P((vm_object_t));
-int vm_pager_get_pages __P((vm_object_t, vm_page_t *, int, int));
-boolean_t vm_pager_has_page __P((vm_object_t, vm_pindex_t, int *, int *));
+static __inline int vm_pager_get_pages __P((vm_object_t, vm_page_t *, int, int));
+static __inline boolean_t vm_pager_has_page __P((vm_object_t, vm_pindex_t, int *, int *));
void vm_pager_init __P((void));
vm_object_t vm_pager_object_lookup __P((struct pagerlst *, void *));
vm_offset_t vm_pager_map_pages __P((vm_page_t *, int, boolean_t));
vm_offset_t vm_pager_map_page __P((vm_page_t));
-int vm_pager_put_pages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
+static __inline int vm_pager_put_pages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
void vm_pager_sync __P((void));
void vm_pager_unmap_pages __P((vm_offset_t, int));
void vm_pager_unmap_page __P((vm_offset_t));
+
+static __inline int
+vm_pager_get_pages(
+ vm_object_t object,
+ vm_page_t *m,
+ int count,
+ int reqpage
+) {
+ return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage));
+}
+
+static __inline int
+vm_pager_put_pages(
+ vm_object_t object,
+ vm_page_t *m,
+ int count,
+ int flags,
+ int *rtvals
+) {
+ return ((*pagertab[object->type]->pgo_putpages)(object, m, count, flags, rtvals));
+}
+
+static __inline boolean_t
+vm_pager_has_page(
+ vm_object_t object,
+ vm_pindex_t offset,
+ int *before,
+ int *after
+) {
+ return ((*pagertab[object->type]->pgo_haspage) (object, offset, before, after));
+}
+
+/*
+ * vm_pager_page_unswapped
+ *
+ * called at splvm() to destroy swap associated with the page.
+ *
+ * This function may not block.
+ */
+
+static __inline void
+vm_pager_page_unswapped(vm_page_t m)
+{
+ if (pagertab[m->object->type]->pgo_pageunswapped)
+ (*pagertab[m->object->type]->pgo_pageunswapped)(m);
+}
+
+
#endif
#endif /* _VM_PAGER_ */
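
With the wrappers converted to inlines, callers keep the same interface; the inline simply indexes pagertab[] by the object's type and jumps through that pager's pgo_getpages hook. A minimal, hypothetical call site, assuming object and m are already set up and that the usual VM_PAGER_* return codes apply:

	vm_page_t marray[1];
	int rv;

	marray[0] = m;
	rv = vm_pager_get_pages(object, marray, 1, 0);	/* reqpage is index 0 */
	if (rv != VM_PAGER_OK) {
		/* the requested page was not filled; the caller must recover */
	}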
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index bfcebdc..f973631 100644
--- a/sys/vm/vm_swap.c
+++ b/sys/vm/vm_swap.c
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)vm_swap.c 8.5 (Berkeley) 2/17/94
- * $Id: vm_swap.c,v 1.56 1998/07/04 22:30:26 julian Exp $
+ * $Id: vm_swap.c,v 1.57 1998/10/25 19:24:04 bde Exp $
*/
#include "opt_devfs.h"
@@ -50,7 +50,7 @@
#include <sys/dmap.h> /* XXX */
#include <sys/vnode.h>
#include <sys/fcntl.h>
-#include <sys/rlist.h>
+#include <sys/blist.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -94,8 +94,7 @@ static dev_t swapdev = makedev(BDEV_MAJOR, 0);
static struct swdevt should_be_malloced[NSWAPDEV];
static struct swdevt *swdevt = should_be_malloced;
struct vnode *swapdev_vp;
-/* XXX swapinfo(8) needs this one I belive */
-int nswap; /* first block after the interleaved devs */
+static int nswap; /* first block after the interleaved devs */
static int nswdev = NSWAPDEV;
int vm_swap_size;
@@ -119,7 +118,13 @@ swstrategy(bp)
register struct swdevt *sp;
struct vnode *vp;
- sz = howmany(bp->b_bcount, DEV_BSIZE);
+ sz = howmany(bp->b_bcount, PAGE_SIZE);
+ /*
+ * Convert interleaved swap into per-device swap. Note that
+ * the block size is left in PAGE_SIZE'd chunks (for the newswap)
+ * here.
+ */
+
if (nswdev > 1) {
off = bp->b_blkno % dmmax;
if (off + sz > dmmax) {
@@ -132,8 +137,9 @@ swstrategy(bp)
index = seg % nswdev;
seg /= nswdev;
bp->b_blkno = seg * dmmax + off;
- } else
+ } else {
index = 0;
+ }
sp = &swdevt[index];
if (bp->b_blkno + sz > sp->sw_nblks) {
bp->b_error = EINVAL;
@@ -148,6 +154,12 @@ swstrategy(bp)
biodone(bp);
return;
}
+
+ /*
+ * Convert from PAGE_SIZE'd to DEV_BSIZE'd chunks for the actual I/O
+ */
+ bp->b_blkno = ctodb(bp->b_blkno);
+
vhold(sp->sw_vp);
s = splvm();
if ((bp->b_flags & B_READ) == 0) {
@@ -161,10 +173,8 @@ swstrategy(bp)
}
sp->sw_vp->v_numoutput++;
}
- if (bp->b_vp != NULL)
- pbrelvp(bp);
+ pbreassignbuf(bp, sp->sw_vp);
splx(s);
- bp->b_vp = sp->sw_vp;
VOP_STRATEGY(bp->b_vp, bp);
}
@@ -240,6 +250,11 @@ swapon(p, uap)
* Each of the nswdev devices provides 1/nswdev'th of the swap
* space, which is laid out with blocks of dmmax pages circularly
* among the devices.
+ *
+ * The new swap code uses page-sized blocks. The old swap code used
+ * DEV_BSIZE'd chunks.
+ *
+ * XXX locking when multiple swapon's run in parallel
*/
int
swaponvp(p, vp, dev, nblks)
@@ -277,18 +292,37 @@ swaponvp(p, vp, dev, nblks)
(void) VOP_CLOSE(vp, FREAD | FWRITE, p->p_ucred, p);
return (ENXIO);
}
+ /*
+ * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
+ * First truncate nblks down to a page boundary, then convert.
+ *
+ * sw->sw_nblks is in page-sized chunks now too.
+ */
+ nblks &= ~(ctodb(1) - 1);
+ nblks = dbtoc(nblks);
+
sp->sw_vp = vp;
sp->sw_dev = dev;
sp->sw_flags |= SW_FREED;
sp->sw_nblks = nblks;
+ /*
+ * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not
+ * DEV_BSIZE'd.
+ */
+
if (nblks * nswdev > nswap)
nswap = (nblks+1) * nswdev;
+ if (swapblist == NULL)
+ swapblist = blist_create(nswap);
+ else
+ blist_resize(&swapblist, nswap, 0);
+
for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
- blk = min(nblks - dvbase,dmmax);
+ blk = min(nblks - dvbase, dmmax);
vsbase = index * dmmax + dvbase * nswdev;
- rlist_free(&swaplist, vsbase, vsbase + blk - 1);
+ blist_free(swapblist, vsbase, blk);
vm_swap_size += blk;
}
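
swstrategy() above now works in PAGE_SIZE'd blocks: it splits an interleaved block number into a stripe offset and a stripe number, derives the device index from the stripe number, recombines the per-device block number, and only converts to DEV_BSIZE'd blocks (ctodb) immediately before issuing the I/O. The userland fragment below merely replays that arithmetic; dmmax, nswdev, the 4K/512-byte ratio behind CTODB(), and the sample block number are all invented for the illustration, and the stripe-straddling error case handled in the diff is omitted.

#include <stdio.h>

#define	DMMAX	32			/* pages per interleave stripe (example) */
#define	NSWDEV	2			/* number of swap devices (example) */
#define	CTODB(x)	((x) << 3)	/* 4096-byte pages over 512-byte dev blocks */

int
main(void)
{
	long blkno = 1000;		/* interleaved, PAGE_SIZE'd swap block */
	long off, seg, index;

	off = blkno % DMMAX;		/* offset within the stripe */
	seg = blkno / DMMAX;		/* global stripe number */
	index = seg % NSWDEV;		/* device that owns this stripe */
	seg /= NSWDEV;			/* stripe number on that device */
	blkno = seg * DMMAX + off;	/* per-device, PAGE_SIZE'd block */

	printf("device %ld, page block %ld, dev block %ld\n",
	    index, blkno, (long)CTODB(blkno));
	return (0);
}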
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index fba7e2f..fe04da4 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -38,7 +38,7 @@
* SUCH DAMAGE.
*
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
- * $Id: vnode_pager.c,v 1.100 1998/10/13 08:24:44 dg Exp $
+ * $Id: vnode_pager.c,v 1.101 1998/12/04 18:39:44 rvb Exp $
*/
/*
@@ -88,6 +88,8 @@ struct pagerops vnodepagerops = {
NULL
};
+int vnode_pbuf_freecnt = -1; /* start out unlimited */
+
/*
* Allocate (or lookup) pager for a vnode.
@@ -106,6 +108,13 @@ vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
if (handle == NULL)
return (NULL);
+ /*
+ * XXX hack - This initialization should be put somewhere else.
+ */
+ if (vnode_pbuf_freecnt < 0) {
+ vnode_pbuf_freecnt = nswbuf / 2 + 1;
+ }
+
vp = (struct vnode *) handle;
/*
@@ -395,7 +404,7 @@ vnode_pager_input_smlfs(object, m)
fileaddr = vnode_pager_addr(vp,
IDX_TO_OFF(m->pindex) + i * bsize, (int *)0);
if (fileaddr != -1) {
- bp = getpbuf();
+ bp = getpbuf(&vnode_pbuf_freecnt);
/* build a minimal buffer header */
bp->b_flags = B_BUSY | B_READ | B_CALL;
@@ -428,7 +437,7 @@ vnode_pager_input_smlfs(object, m)
/*
* free the buffer header back to the swap buffer pool
*/
- relpbuf(bp);
+ relpbuf(bp, &vnode_pbuf_freecnt);
if (error)
break;
@@ -707,7 +716,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
if (dp->v_type == VBLK || dp->v_type == VCHR)
size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
- bp = getpbuf();
+ bp = getpbuf(&vnode_pbuf_freecnt);
kva = (vm_offset_t) bp->b_data;
/*
@@ -755,7 +764,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
/*
* free the buffer header back to the swap buffer pool
*/
- relpbuf(bp);
+ relpbuf(bp, &vnode_pbuf_freecnt);
for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
vm_page_t mt;