63 files changed, 3198 insertions, 2062 deletions
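A conversion that recurs throughout this commit is from open-coded PG_BUSY/PG_WANTED manipulation to a small set of busy-page helpers. The sketch below reconstructs those helpers from the call sites in the hunks that follow; it is an editorial aid, not part of the patch, and the authoritative definitions are the inlines in vm/vm_page.h.

    static __inline void
    vm_page_busy(vm_page_t m)
    {
            /* replaces: m->flags |= PG_BUSY; */
            vm_page_flag_set(m, PG_BUSY);
    }

    static __inline void
    vm_page_flash(vm_page_t m)
    {
            /* wake PG_WANTED sleepers without clearing PG_BUSY */
            if (m->flags & PG_WANTED) {
                    vm_page_flag_clear(m, PG_WANTED);
                    wakeup(m);
            }
    }

    static __inline void
    vm_page_wakeup(vm_page_t m)
    {
            /* replaces: m->flags &= ~PG_BUSY; plus the PG_WANTED dance */
            vm_page_flag_clear(m, PG_BUSY);
            vm_page_flash(m);
    }

    static __inline int
    vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *wmesg)
    {
            /*
             * Sleep while the page is PG_BUSY (and, if also_m_busy,
             * while m->busy is nonzero).  Returns TRUE if it slept,
             * so callers loop and re-test, e.g.:
             *
             *      while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
             *              ;
             */
            if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
                    int s = splvm();

                    if ((m->flags & PG_BUSY) ||
                        (also_m_busy && m->busy)) {
                            vm_page_flag_set(m, PG_WANTED);
                            tsleep(m, PVM, wmesg, 0);
                    }
                    splx(s);
                    return (TRUE);
            }
            return (FALSE);
    }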
diff --git a/sys/alpha/alpha/pmap.c b/sys/alpha/alpha/pmap.c index 0e7aa73..fe8741d 100644 --- a/sys/alpha/alpha/pmap.c +++ b/sys/alpha/alpha/pmap.c @@ -43,7 +43,7 @@ * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap - * $Id: pmap.c,v 1.11 1998/10/21 11:38:06 dg Exp $ + * $Id: pmap.c,v 1.12 1998/10/28 13:36:49 dg Exp $ */ /* @@ -950,7 +950,7 @@ pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) vm_page_t m; retry: m = vm_page_lookup(object, pindex); - if (m && vm_page_sleep(m, "pplookp", NULL)) + if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } @@ -1039,7 +1039,7 @@ pmap_dispose_proc(p) if ((m = vm_page_lookup(upobj, i)) == NULL) panic("pmap_dispose_proc: upage already missing???"); - vm_page_flag_set(m, PG_BUSY); + vm_page_busy(m); oldpte = *(ptek + i); *(ptek + i) = 0; @@ -1128,7 +1128,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { int s; - while (vm_page_sleep(m, "pmuwpt", NULL)); + while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) + ; if (m->hold_count == 0) { vm_offset_t pteva; @@ -1181,7 +1182,7 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) wakeup(m); } - vm_page_flag_set(m, PG_BUSY); + vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } @@ -1316,10 +1317,10 @@ pmap_release_free_page(pmap_t pmap, vm_page_t p) * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ - if (vm_page_sleep(p, "pmaprl", NULL)) + if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; - vm_page_flag_set(p, PG_BUSY); + vm_page_busy(p); /* * Remove the page table page from the processes address space. @@ -2336,7 +2337,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - vm_page_flag_set(p, PG_BUSY); + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + alpha_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); @@ -2356,7 +2357,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - vm_page_flag_set(p, PG_BUSY); + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + alpha_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); @@ -2453,7 +2454,7 @@ pmap_prefault(pmap, addra, entry) if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } - vm_page_flag_set(m, PG_BUSY); + vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); vm_page_flag_set(m, PG_MAPPED); diff --git a/sys/alpha/alpha/symbols.raw b/sys/alpha/alpha/symbols.raw index bf8881a..2b03da9 100644 --- a/sys/alpha/alpha/symbols.raw +++ b/sys/alpha/alpha/symbols.raw @@ -1,6 +1,6 @@ # @(#)symbols.raw 7.6 (Berkeley) 5/8/91 # -# $Id: symbols.raw,v 1.12 1998/03/30 09:48:20 phk Exp $ +# $Id: symbols.raw,v 1.1 1998/06/10 10:53:25 dfr Exp $ # @@ -34,7 +34,8 @@ #pstat # _cons _nswap - _swaplist + _swapblist +# _swaplist #vmstat _cp_time # _rate diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 66c9b63..2a378d3 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -39,7 +39,7 @@ * SUCH DAMAGE. 
* * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.218 1999/01/09 21:41:22 dt Exp $ + * $Id: pmap.c,v 1.219 1999/01/12 00:17:53 eivind Exp $ */ /* @@ -942,7 +942,7 @@ pmap_page_lookup(object, pindex) vm_page_t m; retry: m = vm_page_lookup(object, pindex); - if (m && vm_page_sleep(m, "pplookp", NULL)) + if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } @@ -1009,8 +1009,8 @@ pmap_new_proc(p) } vm_page_wakeup(m); - m->flags &= ~PG_ZERO; - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } if (updateneeded) @@ -1038,7 +1038,7 @@ pmap_dispose_proc(p) if ((m = vm_page_lookup(upobj, i)) == NULL) panic("pmap_dispose_proc: upage already missing???"); - m->flags |= PG_BUSY; + vm_page_busy(m); oldpte = *(ptek + i); *(ptek + i) = 0; @@ -1107,7 +1107,7 @@ pmap_swapin_proc(p) vm_page_wire(m); vm_page_wakeup(m); - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } } @@ -1122,7 +1122,8 @@ pmap_swapin_proc(p) static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { - while (vm_page_sleep(m, "pmuwpt", NULL)); + while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) + ; if (m->hold_count == 0) { vm_offset_t pteva; @@ -1150,12 +1151,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { - if (m->flags & PG_WANTED) { - m->flags &= ~PG_WANTED; - wakeup(m); - } - - m->flags |= PG_BUSY; + vm_page_flash(m); + vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } @@ -1257,7 +1254,8 @@ pmap_pinit(pmap) ptdpg->wire_count = 1; ++cnt.v_wire_count; - ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */ + + vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); @@ -1290,10 +1288,10 @@ pmap_release_free_page(pmap, p) * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ - if (vm_page_sleep(p, "pmaprl", NULL)) + if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; - p->flags |= PG_BUSY; + vm_page_busy(p); /* * Remove the page table page from the processes address space. @@ -1393,8 +1391,9 @@ _pmap_allocpte(pmap, ptepindex) } m->valid = VM_PAGE_BITS_ALL; - m->flags &= ~(PG_ZERO | PG_BUSY); - m->flags |= PG_MAPPED; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED); + vm_page_wakeup(m); return m; } @@ -1713,7 +1712,7 @@ pmap_remove_entry(pmap, ppv, va) TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); @@ -1791,7 +1790,7 @@ pmap_remove_pte(pmap, ptq, va) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (oldpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); @@ -1976,7 +1975,7 @@ pmap_remove_all(pa) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
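The pmap hunks above and below all make the same substitution: direct read-modify-write of m->flags becomes vm_page_flag_set()/vm_page_flag_clear(). A minimal sketch of the idiom, assuming the helpers simply wrap the bit operations; on a uniprocessor kernel of this era they compile to the same code, but funneling every flags access through one pair of functions gives a single place to add locking or atomics later:

    /* before: open-coded, scattered through every pmap */
    m->flags &= ~PG_ZERO;
    m->flags |= PG_MAPPED | PG_WRITEABLE;

    /* after: one audited entry point per direction */
    vm_page_flag_clear(m, PG_ZERO);
    vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);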
@@ -2005,7 +2004,7 @@ pmap_remove_all(pa) free_pv_entry(pv); } - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); if (update_needed) invltlb(); @@ -2081,7 +2080,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) ppv = NULL; if (pbits & PG_A) { ppv = pa_to_pvh(pbits); - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); pbits &= ~PG_A; } if (pbits & PG_M) { @@ -2436,7 +2435,7 @@ pmap_object_init_pt(pmap, addr, object, pindex, size, limit) retry: p = vm_page_lookup(object, pindex); - if (p && vm_page_sleep(p, "init4p", NULL)) + if (p && vm_page_sleep_busy(p, FALSE, "init4p")) goto retry; if (p == NULL) { @@ -2469,7 +2468,7 @@ retry: ptepa += NBPDR; ptepindex += 1; } - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); invltlb(); return; } @@ -2510,11 +2509,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; @@ -2531,11 +2530,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } @@ -2628,10 +2627,10 @@ pmap_prefault(pmap, addra, entry) if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } - m->flags |= PG_BUSY; + vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); - m->flags |= PG_MAPPED; + vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } @@ -3026,7 +3025,7 @@ pmap_remove_pages(pmap, sva, eva) ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); @@ -3406,7 +3405,7 @@ pmap_mincore(pmap, addr) */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; - m->flags |= PG_REFERENCED; + vm_page_flag_set(m, PG_REFERENCED); } } return val; diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index 57ac533..e4be47f 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: cam_periph.c,v 1.8 1998/12/16 21:00:06 ken Exp $ + * $Id: cam_periph.c,v 1.9 1999/01/14 06:21:54 jdp Exp $ */ #include <sys/param.h> @@ -599,7 +599,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) /* * Get the buffer. 
*/ - mapinfo->bp[i] = getpbuf(); + mapinfo->bp[i] = getpbuf(NULL); /* save the buffer's data address */ mapinfo->bp[i]->b_saveaddr = mapinfo->bp[i]->b_data; @@ -674,7 +674,7 @@ cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) mapinfo->bp[i]->b_flags &= ~(B_PHYS|B_BUSY); /* release the buffer */ - relpbuf(mapinfo->bp[i]); + relpbuf(mapinfo->bp[i], NULL); } /* allow ourselves to be swapped once again */ diff --git a/sys/conf/files b/sys/conf/files index 795f6f8..02a281b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -306,6 +306,7 @@ kern/subr_module.c standard kern/subr_prf.c standard kern/subr_prof.c standard kern/subr_rlist.c standard +kern/subr_blist.c standard kern/subr_scanf.c standard kern/subr_xxx.c standard kern/sys_generic.c standard diff --git a/sys/conf/options b/sys/conf/options index 35ceb1a..6dfc0cc 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -1,4 +1,4 @@ -# $Id: options,v 1.120 1999/01/17 19:02:39 peter Exp $ +# $Id: options,v 1.121 1999/01/20 14:49:07 eivind Exp $ # # On the handling of kernel options # @@ -209,6 +209,7 @@ TCPDEBUG IPFILTER opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LKM opt_ipfilter.h +SLIP_IFF_OPTS opt_slip.h # ATM (HARP version) ATM_CORE opt_atm.h diff --git a/sys/fs/procfs/procfs_map.c b/sys/fs/procfs/procfs_map.c index 4dae10a..c6b8966 100644 --- a/sys/fs/procfs/procfs_map.c +++ b/sys/fs/procfs/procfs_map.c @@ -36,7 +36,7 @@ * * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 * - * $Id: procfs_map.c,v 1.17 1998/04/29 04:28:22 dyson Exp $ + * $Id: procfs_map.c,v 1.18 1998/12/04 22:54:51 archie Exp $ */ #include <sys/param.h> @@ -93,7 +93,7 @@ procfs_domap(curp, p, pfs, uio) ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { vm_object_t obj, tobj, lobj; - int ref_count, shadow_count, id, flags; + int ref_count, shadow_count, flags; vm_offset_t addr; int resident, privateresident; char *type; @@ -139,13 +139,11 @@ case OBJT_DEVICE: flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; - id = obj->id; } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; - id = 0; } @@ -154,9 +152,9 @@ case OBJT_DEVICE: * start, end, resident, private resident, cow, access, type. */ snprintf(mebuffer, sizeof(mebuffer), - "0x%x 0x%x %d %d %d %s%s%s %d %d 0x%x %s %s %s\n", + "0x%x 0x%x %d %d %p %s%s%s %d %d 0x%x %s %s %s\n", entry->start, entry->end, - resident, privateresident, id, + resident, privateresident, obj, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c index ff0f347..6096a1b 100644 --- a/sys/fs/specfs/spec_vnops.c +++ b/sys/fs/specfs/spec_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 - * $Id: spec_vnops.c,v 1.77 1998/12/07 21:58:33 archie Exp $ + * $Id: spec_vnops.c,v 1.78 1998/12/16 00:10:51 eivind Exp $ */ #include <sys/param.h> @@ -781,7 +781,7 @@ spec_getpages(ap) blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); - bp = getpbuf(); + bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* @@ -894,13 +894,13 @@ spec_getpages(ap) /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. 
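cam_periph.c above is the first of many callers converted to the new getpbuf()/relpbuf() signatures, which take a pointer to a per-subsystem free count; passing NULL preserves the old unlimited behavior. A sketch of the protocol as it can be inferred from the call sites (the consumer name here is hypothetical):

    /*
     * Each pbuf consumer may declare a quota so that no single
     * subsystem can exhaust the shared pool of nswbuf headers.
     * -1 means "no limit yet"; the subsystem sizes it at init
     * time (compare nfs_pbuf_freecnt = nswbuf / 2 + 1 below).
     */
    static int foo_pbuf_freecnt = -1;       /* hypothetical consumer */

    void
    foo_io(void)
    {
            struct buf *bp;

            bp = getpbuf(&foo_pbuf_freecnt); /* sleeps while quota is 0 */
            /* ... build and issue the transfer ... */
            relpbuf(bp, &foo_pbuf_freecnt);  /* credits the quota back */
    }

trypbuf(&foo_pbuf_freecnt) is the non-sleeping variant, returning NULL on failure; the vfs_cluster.c hunks below use it with cluster_pbuf_freecnt.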
*/ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_OK; } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 66c9b63..2a378d3 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -39,7 +39,7 @@ * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.218 1999/01/09 21:41:22 dt Exp $ + * $Id: pmap.c,v 1.219 1999/01/12 00:17:53 eivind Exp $ */ /* @@ -942,7 +942,7 @@ pmap_page_lookup(object, pindex) vm_page_t m; retry: m = vm_page_lookup(object, pindex); - if (m && vm_page_sleep(m, "pplookp", NULL)) + if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } @@ -1009,8 +1009,8 @@ pmap_new_proc(p) } vm_page_wakeup(m); - m->flags &= ~PG_ZERO; - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } if (updateneeded) @@ -1038,7 +1038,7 @@ pmap_dispose_proc(p) if ((m = vm_page_lookup(upobj, i)) == NULL) panic("pmap_dispose_proc: upage already missing???"); - m->flags |= PG_BUSY; + vm_page_busy(m); oldpte = *(ptek + i); *(ptek + i) = 0; @@ -1107,7 +1107,7 @@ pmap_swapin_proc(p) vm_page_wire(m); vm_page_wakeup(m); - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } } @@ -1122,7 +1122,8 @@ pmap_swapin_proc(p) static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { - while (vm_page_sleep(m, "pmuwpt", NULL)); + while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) + ; if (m->hold_count == 0) { vm_offset_t pteva; @@ -1150,12 +1151,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { - if (m->flags & PG_WANTED) { - m->flags &= ~PG_WANTED; - wakeup(m); - } - - m->flags |= PG_BUSY; + vm_page_flash(m); + vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } @@ -1257,7 +1254,8 @@ pmap_pinit(pmap) ptdpg->wire_count = 1; ++cnt.v_wire_count; - ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */ + + vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); @@ -1290,10 +1288,10 @@ pmap_release_free_page(pmap, p) * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ - if (vm_page_sleep(p, "pmaprl", NULL)) + if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; - p->flags |= PG_BUSY; + vm_page_busy(p); /* * Remove the page table page from the processes address space. 
@@ -1393,8 +1391,9 @@ _pmap_allocpte(pmap, ptepindex) } m->valid = VM_PAGE_BITS_ALL; - m->flags &= ~(PG_ZERO | PG_BUSY); - m->flags |= PG_MAPPED; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED); + vm_page_wakeup(m); return m; } @@ -1713,7 +1712,7 @@ pmap_remove_entry(pmap, ppv, va) TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); @@ -1791,7 +1790,7 @@ pmap_remove_pte(pmap, ptq, va) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (oldpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); @@ -1976,7 +1975,7 @@ pmap_remove_all(pa) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. @@ -2005,7 +2004,7 @@ pmap_remove_all(pa) free_pv_entry(pv); } - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); if (update_needed) invltlb(); @@ -2081,7 +2080,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) ppv = NULL; if (pbits & PG_A) { ppv = pa_to_pvh(pbits); - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); pbits &= ~PG_A; } if (pbits & PG_M) { @@ -2436,7 +2435,7 @@ pmap_object_init_pt(pmap, addr, object, pindex, size, limit) retry: p = vm_page_lookup(object, pindex); - if (p && vm_page_sleep(p, "init4p", NULL)) + if (p && vm_page_sleep_busy(p, FALSE, "init4p")) goto retry; if (p == NULL) { @@ -2469,7 +2468,7 @@ retry: ptepa += NBPDR; ptepindex += 1; } - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); invltlb(); return; } @@ -2510,11 +2509,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; @@ -2531,11 +2530,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } @@ -2628,10 +2627,10 @@ pmap_prefault(pmap, addra, entry) if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } - m->flags |= PG_BUSY; + vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); - m->flags |= PG_MAPPED; + vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } @@ -3026,7 +3025,7 @@ pmap_remove_pages(pmap, sva, eva) ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); @@ -3406,7 +3405,7 @@ pmap_mincore(pmap, addr) */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; - m->flags |= PG_REFERENCED; 
+ vm_page_flag_set(m, PG_REFERENCED); } } return val; diff --git a/sys/i386/i386/symbols.raw b/sys/i386/i386/symbols.raw index 4703c30..943d8ae 100644 --- a/sys/i386/i386/symbols.raw +++ b/sys/i386/i386/symbols.raw @@ -1,6 +1,6 @@ # @(#)symbols.raw 7.6 (Berkeley) 5/8/91 # -# $Id: symbols.raw,v 1.12 1998/03/30 09:48:20 phk Exp $ +# $Id: symbols.raw,v 1.13 1998/09/15 10:03:43 gibbs Exp $ # @@ -28,7 +28,8 @@ #pstat # _cons _nswap - _swaplist + _swapblist +# _swaplist #vmstat _cp_time # _rate diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index a9776a5..be9f9d3 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 - * $Id: kern_malloc.c,v 1.50 1999/01/08 17:31:09 eivind Exp $ + * $Id: kern_malloc.c,v 1.51 1999/01/10 01:58:24 eivind Exp $ */ #include "opt_vm.h" @@ -101,7 +101,16 @@ struct freelist { #endif /* INVARIANTS */ /* - * Allocate a block of memory + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + * + * If M_ASLEEP is set (M_NOWAIT must also be set), this routine + * will have the side effect of calling asleep() if it returns NULL, + * allowing the parent to await() at some future time. */ void * malloc(size, type, flags) @@ -122,13 +131,26 @@ malloc(size, type, flags) #endif register struct malloc_type *ksp = type; - if (!type->ks_next) + /* + * Must be at splmem() prior to initializing segment to handle + * potential initialization race. + */ + + s = splmem(); + + if (!type->ks_next) { malloc_init(type); + } indx = BUCKETINDX(size); kbp = &bucket[indx]; - s = splmem(); + while (ksp->ks_memuse >= ksp->ks_limit) { + if (flags & M_ASLEEP) { + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + asleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0); + } if (flags & M_NOWAIT) { splx(s); return ((void *) NULL); @@ -239,7 +261,11 @@ out: } /* - * Free a block of memory allocated by malloc. + * free: + * + * Free a block of memory allocated by malloc. + * + * This routine may not block. */ void free(addr, type) diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index 441d95f..ad63a98 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -16,7 +16,7 @@ * 4. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: kern_physio.c,v 1.28 1998/08/19 10:50:32 sos Exp $ + * $Id: kern_physio.c,v 1.29 1998/10/25 17:44:51 phk Exp $ */ #include <sys/param.h> @@ -147,7 +147,7 @@ physio(strategy, bp, dev, rw, minp, uio) doerror: - relpbuf(bpa); + relpbuf(bpa, NULL); if (!bp_alloc) { bp->b_flags &= ~(B_BUSY|B_PHYS); if( bp->b_flags & B_WANTED) { @@ -197,13 +197,13 @@ phygetvpbuf(dev_t dev, int resid) bdsw = cdevsw[major(dev)]; if ((bdsw == NULL) || (bdsw->d_bmaj == -1)) - return getpbuf(); + return getpbuf(NULL); maxio = bdsw->d_maxio; if (resid > maxio) resid = maxio; - return getpbuf(); + return getpbuf(NULL); } static void diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c index d637ab4..810b87e 100644 --- a/sys/kern/subr_rlist.c +++ b/sys/kern/subr_rlist.c @@ -13,7 +13,7 @@ * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This software is a component of "386BSD" developed by - William F. Jolitz, TeleMuse. + * William F. Jolitz, TeleMuse. * 4. 
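The kern_malloc.c hunk above documents the new M_ASLEEP flag: when an M_NOWAIT allocation hits the type's limit, malloc() registers the caller on the type's sleep channel via asleep() before returning NULL, so the caller can await() later instead of spinning. A hedged sketch of the calling pattern; the retry loop is illustrative, and await()'s argument form is assumed from the asleep() call in the hunk, not shown by the patch:

    void *p;

    for (;;) {
            /* fail fast, but queue us for a wakeup if we do fail */
            p = malloc(size, M_DEVBUF, M_NOWAIT | M_ASLEEP);
            if (p != NULL)
                    break;
            /*
             * asleep() already recorded the sleep inside malloc();
             * await() blocks until the type drops below its limit.
             * Assumed signature: await(pri, timo) - see kern_synch.c.
             */
            await(PSWP + 2, 0);
    }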
Neither the name of the developer nor the name "386BSD" * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -54,9 +54,13 @@ * functioning of this software, nor does the author assume any responsibility * for damages incurred with its use. * - * $Id: subr_rlist.c,v 1.28 1999/01/08 17:31:12 eivind Exp $ + * --------- DEPRECIATED --------- + * + * $Id: subr_rlist.c,v 1.29 1999/01/10 01:58:25 eivind Exp $ */ +#if 0 + #include <sys/param.h> #include <sys/systm.h> #include <sys/rlist.h> @@ -307,3 +311,6 @@ rlist_destroy (rlh) rlist_mfree(lp); } } + +#endif + diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index edc74a7..a6c2dfe 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1,4 +1,4 @@ -/* $Id: sysv_shm.c,v 1.38 1998/08/24 08:39:38 dfr Exp $ */ +/* $Id: sysv_shm.c,v 1.39 1998/10/13 08:24:40 dg Exp $ */ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /* @@ -52,6 +52,7 @@ #include <vm/pmap.h> #include <vm/vm_object.h> #include <vm/vm_map.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vm_inherit.h> diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 6cc487a..1634681 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 - * $Id: uipc_syscalls.c,v 1.48 1998/12/03 12:35:47 dg Exp $ + * $Id: uipc_syscalls.c,v 1.49 1998/12/07 21:58:29 archie Exp $ */ #include "opt_compat.h" @@ -1543,7 +1543,13 @@ retry_lookup: VM_WAIT; goto retry_lookup; } - vm_page_flag_clear(pg, PG_BUSY); + /* + * don't just clear PG_BUSY manually - + * vm_page_alloc() should be considered opaque, + * use the VM routine provided to clear + * PG_BUSY. + */ + vm_page_wakeup(pg); } /* * Ensure that our page is still around when the I/O completes. @@ -1583,21 +1589,12 @@ retry_lookup: goto done; } } else { - if ((pg->flags & PG_BUSY) || pg->busy) { - s = splvm(); - if ((pg->flags & PG_BUSY) || pg->busy) { - /* - * Page is busy. Wait and retry. - */ - vm_page_flag_set(pg, PG_WANTED); - tsleep(pg, PVM, "sfpbsy", 0); - splx(s); - goto retry_lookup; - } - splx(s); - } + if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) + goto retry_lookup; + /* - * Protect from having the page ripped out from beneath us. + * Protect from having the page ripped out from + * beneath us. */ vm_page_wire(pg); } diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 93f6164..d528f5e 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 - * $Id: uipc_usrreq.c,v 1.36 1998/07/15 02:32:12 bde Exp $ + * $Id: uipc_usrreq.c,v 1.37 1998/10/25 17:44:51 phk Exp $ */ #include <sys/param.h> @@ -1114,8 +1114,11 @@ unp_gc() /* * for each FD on our hit list, do the following two things */ - for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) - sorflush((struct socket *)(*fpp)->f_data); + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { + struct file *tfp = *fpp; + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) + sorflush((struct socket *)(tfp->f_data)); + } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) closef(*fpp, (struct proc *) NULL); free((caddr_t)extra_ref, M_FILE); diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index c7c8aa9..c1af873 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -13,7 +13,7 @@ * bad that happens because of using this software isn't the responsibility * of the author. 
This software is distributed AS-IS. * - * $Id: vfs_aio.c,v 1.35 1998/11/27 01:14:21 tegge Exp $ + * $Id: vfs_aio.c,v 1.36 1998/12/15 17:38:33 des Exp $ */ /* @@ -386,7 +386,7 @@ aio_free_entry(struct aiocblist *aiocbe) splx(s); if (aiocbe->bp) { vunmapbuf(aiocbe->bp); - relpbuf(aiocbe->bp); + relpbuf(aiocbe->bp, NULL); aiocbe->bp = NULL; } } @@ -1035,7 +1035,7 @@ aio_qphysio(p, aiocbe) } /* create and build a buffer header for a transfer */ - bp = (struct buf *)getpbuf(); + bp = (struct buf *)getpbuf(NULL); /* * get a copy of the kva from the physical buffer @@ -1122,7 +1122,7 @@ doerror: lj->lioj_buffer_count--; } aiocbe->bp = NULL; - relpbuf(bp); + relpbuf(bp, NULL); return error; } @@ -1172,7 +1172,7 @@ aio_fphysio(p, iocb, flgwait) error = bp->b_error; } - relpbuf(bp); + relpbuf(bp, NULL); return (error); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 30018b5..3bb204e 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -11,7 +11,7 @@ * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.192 1999/01/12 11:59:34 eivind Exp $ + * $Id: vfs_bio.c,v 1.193 1999/01/19 08:00:51 dillon Exp $ */ /* @@ -562,7 +562,7 @@ brelse(struct buf * bp) int s; if (bp->b_flags & B_CLUSTER) { - relpbuf(bp); + relpbuf(bp, NULL); return; } @@ -1364,6 +1364,7 @@ vfs_setdirty(struct buf *bp) { break; } } + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); if (boffset < bp->b_dirtyoff) { bp->b_dirtyoff = max(boffset, 0); @@ -1412,7 +1413,6 @@ loop: if ((bp = gbincore(vp, blkno))) { if (bp->b_flags & B_BUSY) { - bp->b_flags |= B_WANTED; if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; @@ -1429,16 +1429,13 @@ loop: bremfree(bp); /* - * check for size inconsistancies (note that they shouldn't - * happen but do when filesystems don't handle the size changes - * correctly.) We are conservative on metadata and don't just - * extend the buffer but write (if needed) and re-constitute it. + * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { - if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) { - allocbuf(bp, size); - } else { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize) + ) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp); @@ -1455,15 +1452,26 @@ loop: goto loop; } } + + /* + * If the size is inconsistant in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting cleared. + */ + + if (bp->b_bcount != size) + allocbuf(bp, size); + KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); + /* * Check that the constituted buffer really deserves for the * B_CACHE bit to be set. B_VMIO type buffers might not * contain fully valid pages. Normal (old-style) buffers - * should be fully valid. + * should be fully valid. This might also lead to B_CACHE + * getting clear. */ - if (bp->b_flags & B_VMIO) { + if ((bp->b_flags & B_VMIO|B_CACHE) == (B_VMIO|B_CACHE)) { int checksize = bp->b_bufsize; int poffset = bp->b_offset & PAGE_MASK; int resid; @@ -1479,6 +1487,19 @@ loop: } } + /* + * If B_DELWRI is set and B_CACHE got cleared ( or was + * already clear ), we have to commit the write and + * retry. The NFS code absolutely depends on this, + * and so might the FFS code. In anycase, it formalizes + * the B_CACHE rules. See sys/buf.h. 
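The comment above states the invariant informally; the hunk that follows enforces it in getblk(), and the note added to sys/buf.h later in this diff restates it. As an editorial aside, the legal flag combinations under the new rule:

    /*
     *  B_CACHE  B_DELWRI   meaning
     *     0        0       contents unknown, clean - safe to re-read
     *     1        0       contents entirely valid, clean
     *     1        1       contents entirely valid, dirty - recyclable
     *                      only after the write completes
     *     0        1       must NOT be left standing: the buffer is
     *                      dirty but no longer fully valid, so it is
     *                      committed (VOP_BWRITE) so B_DELWRI can be
     *                      cleared, and the lookup retried
     */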
+ */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + VOP_BWRITE(bp); + goto loop; + } + if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); @@ -1572,19 +1593,18 @@ geteblk(int size) /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated - * VM object (in the case of VMIO operations). + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve - * deadlock or inconsistant data situations. Tread lightly!!! - * - * Modify the length of a buffer's underlying buffer storage without - * destroying information (unless, of course the buffer is shrinking). + * deadlock or inconsistant data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. */ + int -allocbuf(struct buf * bp, int size) +allocbuf(struct buf *bp, int size) { - - int s; int newbsize, mbsize; int i; @@ -1705,7 +1725,8 @@ allocbuf(struct buf * bp, int size) m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); - vm_page_sleep(m, "biodep", &m->busy); + while (vm_page_sleep_busy(m, TRUE, "biodep")) + ; bp->b_pages[i] = NULL; vm_page_unwire(m, 0); @@ -1771,16 +1792,25 @@ allocbuf(struct buf * bp, int size) } vm_page_wire(m); - vm_page_flag_clear(m, PG_BUSY); + vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; - } else if (m->flags & PG_BUSY) { - s = splvm(); - if (m->flags & PG_BUSY) { - vm_page_flag_set(m, PG_WANTED); - tsleep(m, PVM, "pgtblk", 0); - } - splx(s); + } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) { + /* + * If we had to sleep, retry. + * + * Also note that we only test + * PG_BUSY here, not m->busy. + * + * We cannot sleep on m->busy + * here because a vm_fault -> + * getpages -> cluster-read -> + * ...-> allocbuf sequence + * will convert PG_BUSY to + * m->busy so we have to let + * m->busy through if we do + * not want to deadlock. + */ goto doretry; } else { if ((curproc != pageproc) && @@ -2010,12 +2040,8 @@ biodone(register struct buf * bp) foff += resid; iosize -= resid; } - if (obj && - (obj->paging_in_progress == 0) && - (obj->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(obj, OBJ_PIPWNT); - wakeup(obj); - } + if (obj) + vm_object_pip_wakeupn(obj, 0); } /* * For asynchronous completions, release the buffer now. The brelse @@ -2096,11 +2122,7 @@ vfs_unbusy_pages(struct buf * bp) vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } - if (obj->paging_in_progress == 0 && - (obj->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(obj, OBJ_PIPWNT); - wakeup(obj); - } + vm_object_pip_wakeupn(obj, 0); } } @@ -2109,6 +2131,8 @@ vfs_unbusy_pages(struct buf * bp) * of a page. If the consumer is not NFS, and the page is not * valid for the entire range, clear the B_CACHE flag to force * the consumer to re-read the page. + * + * B_CACHE interaction is especially tricky. */ static void vfs_buf_set_valid(struct buf *bp, @@ -2135,13 +2159,16 @@ vfs_buf_set_valid(struct buf *bp, } evalid = min(evalid, off + size); /* - * Make sure this range is contiguous with the range - * built up from previous pages. If not, then we will - * just use the range from the previous pages. + * We can only set b_validoff/end if this range is contiguous + * with the range built up already. If we cannot set + * b_validoff/end, we must clear B_CACHE to force an update + * to clean the bp up. 
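biodone() and vfs_unbusy_pages() above drop their open-coded OBJ_PIPWNT test in favor of vm_object_pip_wakeupn(). A sketch of what the helper consolidates, reconstructed from the code being deleted; the real definition lives in the VM headers, and the argument convention (i == 0 meaning "just test and wake") is inferred from the call sites:

    static __inline void
    vm_object_pip_wakeupn(vm_object_t object, int i)
    {
            if (i)
                    object->paging_in_progress -= i;
            if ((object->flags & OBJ_PIPWNT) &&
                object->paging_in_progress == 0) {
                    vm_object_clear_flag(object, OBJ_PIPWNT);
                    wakeup(object);
            }
    }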
*/ if (svalid == bp->b_validend) { bp->b_validoff = min(bp->b_validoff, svalid); bp->b_validend = max(bp->b_validend, evalid); + } else { + bp->b_flags &= ~B_CACHE; } } else if (!vm_page_is_valid(m, (vm_offset_t) ((foff + off) & PAGE_MASK), @@ -2154,6 +2181,10 @@ vfs_buf_set_valid(struct buf *bp, * Set the valid bits in a page, taking care of the b_validoff, * b_validend fields which NFS uses to optimise small reads. Off is * the offset within the file and pageno is the page index within the buf. + * + * XXX we have to set the valid & clean bits for all page fragments + * touched by b_validoff/validend, even if the page fragment goes somewhat + * beyond b_validoff/validend due to alignment. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) @@ -2208,7 +2239,7 @@ vfs_busy_pages(struct buf * bp, int clear_modify) retry: for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; - if (vm_page_sleep(m, "vbpage", NULL)) + if (vm_page_sleep_busy(m, FALSE, "vbpage")) goto retry; } diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index ce842ad..781508e 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.76 1999/01/08 17:31:15 eivind Exp $ + * $Id: vfs_cluster.c,v 1.77 1999/01/10 01:58:25 eivind Exp $ */ #include "opt_debug_cluster.h" @@ -68,6 +68,8 @@ static struct buf * extern vm_page_t bogus_page; +extern int cluster_pbuf_freecnt; + /* * Maximum number of blocks for read-ahead. */ @@ -336,7 +338,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; - bp = trypbuf(); + bp = trypbuf(&cluster_pbuf_freecnt); if (bp == 0) return tbp; @@ -475,7 +477,7 @@ cluster_callback(bp) tbp->b_dirtyoff = tbp->b_dirtyend = 0; biodone(tbp); } - relpbuf(bp); + relpbuf(bp, &cluster_pbuf_freecnt); } /* @@ -654,7 +656,7 @@ cluster_wbuild(vp, size, start_lbn, len) (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || - ((bp = trypbuf()) == NULL)) { + ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 179ef78..44b1698 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.181 1999/01/08 17:31:17 eivind Exp $ + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ */ /* @@ -63,10 +63,13 @@ #include <machine/limits.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> #include <vm/vm_object.h> #include <vm/vm_extern.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vnode_pager.h> #include <vm/vm_zone.h> @@ -985,6 +988,10 @@ sched_sync(void) /* * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. 
*/ void pbgetvp(vp, bp) @@ -995,6 +1002,7 @@ pbgetvp(vp, bp) KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; + bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else @@ -1011,7 +1019,34 @@ pbrelvp(bp) KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; } /* @@ -1034,6 +1069,15 @@ reassignbuf(bp, newvp) return; } +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + s = splbio(); /* * Delete from old vnode list, if on one. diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 179ef78..44b1698 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.181 1999/01/08 17:31:17 eivind Exp $ + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ */ /* @@ -63,10 +63,13 @@ #include <machine/limits.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> #include <vm/vm_object.h> #include <vm/vm_extern.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vnode_pager.h> #include <vm/vm_zone.h> @@ -985,6 +988,10 @@ sched_sync(void) /* * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. */ void pbgetvp(vp, bp) @@ -995,6 +1002,7 @@ pbgetvp(vp, bp) KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; + bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else @@ -1011,7 +1019,34 @@ pbrelvp(bp) KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; } /* @@ -1034,6 +1069,15 @@ reassignbuf(bp, newvp) return; } +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + s = splbio(); /* * Delete from old vnode list, if on one. diff --git a/sys/miscfs/devfs/devfs_vnops.c b/sys/miscfs/devfs/devfs_vnops.c index e9bdc2a..56fa842 100644 --- a/sys/miscfs/devfs/devfs_vnops.c +++ b/sys/miscfs/devfs/devfs_vnops.c @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
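pbgetvp()/pbrelvp() now bracket a pbuf's life with B_PAGING, and the new pbreassignbuf() is the only legal way to move such a buffer to another vnode, since reassignbuf() now panics on it (note that the prototype added to buf.h later in this diff spells it bpreassignbuf while the definition here is pbreassignbuf). A sketch of the intended lifecycle, with a hypothetical pager caller:

    struct buf *bp = getpbuf(NULL);

    pbgetvp(vp, bp);        /* sets B_PAGING: bp is NOT on vp's
                             * clean/dirty lists, holds no ref count */
    /* ... issue the paging I/O ... */

    /* moving the bp to another vnode, e.g. the swap device: */
    pbreassignbuf(bp, swapvp);      /* legal: B_PAGING is set */
    /* reassignbuf(bp, swapvp) would panic("cannot reassign
     * paging buffer") */

    pbrelvp(bp);            /* clears B_PAGING, detaches the vnode */
    relpbuf(bp, NULL);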
* - * $Id: devfs_vnops.c,v 1.64 1998/12/15 23:46:59 eivind Exp $ + * $Id: devfs_vnops.c,v 1.65 1999/01/12 11:49:29 eivind Exp $ */ @@ -1933,7 +1933,7 @@ devfs_getpages(struct vop_getpages_args *ap) blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); - bp = getpbuf(); + bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* @@ -2042,13 +2042,13 @@ devfs_getpages(struct vop_getpages_args *ap) /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_OK; } diff --git a/sys/miscfs/procfs/procfs_map.c b/sys/miscfs/procfs/procfs_map.c index 4dae10a..c6b8966 100644 --- a/sys/miscfs/procfs/procfs_map.c +++ b/sys/miscfs/procfs/procfs_map.c @@ -36,7 +36,7 @@ * * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 * - * $Id: procfs_map.c,v 1.17 1998/04/29 04:28:22 dyson Exp $ + * $Id: procfs_map.c,v 1.18 1998/12/04 22:54:51 archie Exp $ */ #include <sys/param.h> @@ -93,7 +93,7 @@ procfs_domap(curp, p, pfs, uio) ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { vm_object_t obj, tobj, lobj; - int ref_count, shadow_count, id, flags; + int ref_count, shadow_count, flags; vm_offset_t addr; int resident, privateresident; char *type; @@ -139,13 +139,11 @@ case OBJT_DEVICE: flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; - id = obj->id; } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; - id = 0; } @@ -154,9 +152,9 @@ case OBJT_DEVICE: * start, end, resident, private resident, cow, access, type. */ snprintf(mebuffer, sizeof(mebuffer), - "0x%x 0x%x %d %d %d %s%s%s %d %d 0x%x %s %s %s\n", + "0x%x 0x%x %d %d %p %s%s%s %d %d 0x%x %s %s %s\n", entry->start, entry->end, - resident, privateresident, id, + resident, privateresident, obj, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index ff0f347..6096a1b 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 - * $Id: spec_vnops.c,v 1.77 1998/12/07 21:58:33 archie Exp $ + * $Id: spec_vnops.c,v 1.78 1998/12/16 00:10:51 eivind Exp $ */ #include <sys/param.h> @@ -781,7 +781,7 @@ spec_getpages(ap) blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); - bp = getpbuf(); + bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* @@ -894,13 +894,13 @@ spec_getpages(ap) /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_OK; } diff --git a/sys/net/if_sl.c b/sys/net/if_sl.c index 99a6978..151df6e 100644 --- a/sys/net/if_sl.c +++ b/sys/net/if_sl.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * @(#)if_sl.c 8.6 (Berkeley) 2/1/94 - * $Id: if_sl.c,v 1.69 1998/06/07 17:12:05 dfr Exp $ + * $Id: if_sl.c,v 1.70 1998/07/15 02:32:23 bde Exp $ */ /* @@ -70,7 +70,9 @@ #include "bpfilter.h" #include "opt_inet.h" - +#if !defined(ACTUALLY_LKM_NOT_KERNEL) && !defined(KLD_MODULE) +#include "opt_slip.h" +#endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -217,7 +219,11 @@ slattach(dummy) sc->sc_if.if_unit = i++; sc->sc_if.if_mtu = SLMTU; sc->sc_if.if_flags = - IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST; +#ifdef SLIP_IFF_OPTS + SLIP_IFF_OPTS; +#else + IFF_BROADCAST | IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST; +#endif sc->sc_if.if_type = IFT_SLIP; sc->sc_if.if_ioctl = slioctl; sc->sc_if.if_output = sloutput; diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index c973700..fb437a5 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.64 1998/12/07 21:58:43 archie Exp $ + * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $ */ @@ -68,6 +68,7 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, static void nfs_prot_buf __P((struct buf *bp, int off, int n)); extern int nfs_numasync; +extern int nfs_pbuf_freecnt; extern struct nfsstats nfsstats; /* @@ -113,7 +114,7 @@ nfs_getpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); npages = btoc(count); kva = (vm_offset_t) bp->b_data; @@ -132,10 +133,16 @@ nfs_getpages(ap) error = nfs_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); - if (error && (uio.uio_resid == count)) + if (error && (uio.uio_resid == count)) { + printf("nfs_getpages: error %d\n", error); + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) + vnode_pager_freepage(pages[i]); + } return VM_PAGER_ERROR; + } size = count - uio.uio_resid; @@ -228,7 +235,7 @@ nfs_putpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -251,7 +258,7 @@ nfs_putpages(ap) error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; @@ -439,6 +446,7 @@ again: bp = nfs_getcacheblk(vp, lbn, bufsize, p); if (!bp) return (EINTR); + /* * If we are being called from nfs_getpages, we must * make sure the buffer is a vmio buffer. The vp will @@ -779,6 +787,7 @@ again: * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. */ + if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { bp->b_proc = p; @@ -1254,17 +1263,24 @@ nfs_doio(bp, cr, p) * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. + * + * If the buffer is marked B_PAGING, it does not reside on + * the vp's paging queues so we do not ( and cannot ) reassign + * it. XXX numdirtybuffers should be integrated into + * reassignbuf() call. 
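On the if_sl.c hunk above: the default SLIP interface flags become overridable through the new SLIP_IFF_OPTS option registered in sys/conf/options. A hypothetical kernel-config line and its effect on slattach():

    /* in the kernel config file (values hypothetical):
     *   options SLIP_IFF_OPTS="IFF_POINTOPOINT|SC_AUTOCOMP|IFF_MULTICAST"
     * opt_slip.h then carries the corresponding #define, so: */
    sc->sc_if.if_flags =
    #ifdef SLIP_IFF_OPTS
            SLIP_IFF_OPTS;          /* config-supplied flags */
    #else
            IFF_BROADCAST | IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST;
    #endif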
*/ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; bp->b_flags &= ~(B_INVAL|B_NOCACHE); - ++numdirtybuffers; - bp->b_flags |= B_DELWRI; - s = splbio(); - reassignbuf(bp, vp); - splx(s); + if ((bp->b_flags & B_PAGING) == 0) { + ++numdirtybuffers; + bp->b_flags |= B_DELWRI; + s = splbio(); + reassignbuf(bp, vp); + splx(s); + } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c index b3eec24..6c9cfb7 100644 --- a/sys/nfs/nfs_common.c +++ b/sys/nfs/nfs_common.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c index b3eec24..6c9cfb7 100644 --- a/sys/nfs/nfs_subs.c +++ b/sys/nfs/nfs_subs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index c97267a..4131b60 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.115 1998/12/25 10:34:27 dfr Exp $ + * $Id: nfs_vnops.c,v 1.116 1999/01/12 12:39:14 eivind Exp $ */ @@ -2627,14 +2627,17 @@ nfs_strategy(ap) if (bp->b_flags & B_PHYS) panic("nfs physio"); + if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ + if (bp->b_flags & B_READ) cr = bp->b_rcred; else cr = bp->b_wcred; + /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index c973700..fb437a5 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.64 1998/12/07 21:58:43 archie Exp $ + * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $ */ @@ -68,6 +68,7 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, static void nfs_prot_buf __P((struct buf *bp, int off, int n)); extern int nfs_numasync; +extern int nfs_pbuf_freecnt; extern struct nfsstats nfsstats; /* @@ -113,7 +114,7 @@ nfs_getpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. 
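The hunk below threads an NFS-private quota, nfs_pbuf_freecnt, through every getpbuf()/relpbuf() pair in nfs_getpages()/nfs_putpages(). nfs_init() sizes it to nswbuf / 2 + 1, so the NFS paging paths can never tie up more than about half of the shared pbuf headers. A worked example, assuming the then-typical pool size:

    /* nswbuf is typically clamped around 256 on kernels of this era */
    nfs_pbuf_freecnt = nswbuf / 2 + 1;      /* 256 / 2 + 1 = 129 */

    /* every allocation is paired against the same counter: */
    bp = getpbuf(&nfs_pbuf_freecnt);        /* decrements, sleeps at 0 */
    /* ... read/write RPC ... */
    relpbuf(bp, &nfs_pbuf_freecnt);         /* increments, wakes waiters */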
*/ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); npages = btoc(count); kva = (vm_offset_t) bp->b_data; @@ -132,10 +133,16 @@ nfs_getpages(ap) error = nfs_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); - if (error && (uio.uio_resid == count)) + if (error && (uio.uio_resid == count)) { + printf("nfs_getpages: error %d\n", error); + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) + vnode_pager_freepage(pages[i]); + } return VM_PAGER_ERROR; + } size = count - uio.uio_resid; @@ -228,7 +235,7 @@ nfs_putpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -251,7 +258,7 @@ nfs_putpages(ap) error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; @@ -439,6 +446,7 @@ again: bp = nfs_getcacheblk(vp, lbn, bufsize, p); if (!bp) return (EINTR); + /* * If we are being called from nfs_getpages, we must * make sure the buffer is a vmio buffer. The vp will @@ -779,6 +787,7 @@ again: * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. */ + if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { bp->b_proc = p; @@ -1254,17 +1263,24 @@ nfs_doio(bp, cr, p) * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. + * + * If the buffer is marked B_PAGING, it does not reside on + * the vp's paging queues so we do not ( and cannot ) reassign + * it. XXX numdirtybuffers should be integrated into + * reassignbuf() call. */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; bp->b_flags &= ~(B_INVAL|B_NOCACHE); - ++numdirtybuffers; - bp->b_flags |= B_DELWRI; - s = splbio(); - reassignbuf(bp, vp); - splx(s); + if ((bp->b_flags & B_PAGING) == 0) { + ++numdirtybuffers; + bp->b_flags |= B_DELWRI; + s = splbio(); + reassignbuf(bp, vp); + splx(s); + } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c index b3eec24..6c9cfb7 100644 --- a/sys/nfsclient/nfs_subs.c +++ b/sys/nfsclient/nfs_subs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index c97267a..4131b60 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.115 1998/12/25 10:34:27 dfr Exp $ + * $Id: nfs_vnops.c,v 1.116 1999/01/12 12:39:14 eivind Exp $ */ @@ -2627,14 +2627,17 @@ nfs_strategy(ap) if (bp->b_flags & B_PHYS) panic("nfs physio"); + if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ + if (bp->b_flags & B_READ) cr = bp->b_rcred; else cr = bp->b_wcred; + /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c index b3eec24..6c9cfb7 100644 --- a/sys/nfsserver/nfs_srvsubs.c +++ b/sys/nfsserver/nfs_srvsubs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/sys/bio.h b/sys/sys/bio.h index 191fdbc..f2b0f4b 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.60 1998/10/31 14:05:11 peter Exp $ + * $Id: buf.h,v 1.61 1998/11/13 01:01:44 dg Exp $ */ #ifndef _SYS_BUF_H_ @@ -116,7 +116,10 @@ struct buf { caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ - void *b_spc; + union pager_info { + void *pg_spc; + int pg_reqpage; + } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; @@ -126,9 +129,29 @@ struct buf { struct workhead b_dep; /* List of filesystem dependencies. */ }; +#define b_spc b_pager.pg_spc + /* * These flags are kept in b_flags. + * + * Notes: + * + * B_ASYNC VOP calls on bp's are usually async whether or not + * B_ASYNC is set, but some subsystems, such as NFS, like + * to know what is best for the caller so they can + * optimize the I/O. + * + * B_PAGING Indicates that bp is being used by the paging system or + * some paging system and that the bp is not linked into + * the b_vp's clean/dirty linked lists or ref counts. + * Buffer vp reassignments are illegal in this case. + * + * B_CACHE This may only be set if the buffer is entirely valid. + * The situation where B_DELWRI is set and B_CACHE gets + * cleared MUST be committed to disk so B_DELWRI can + * also be cleared. */ + #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. 
*/ @@ -312,13 +335,12 @@ int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); -struct buf * getpbuf __P((void)); +struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); -int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); @@ -336,13 +358,15 @@ void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); -void relpbuf __P((struct buf *)); +void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); +int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); -struct buf *trypbuf __P((void)); +void bpreassignbuf __P((struct buf *, struct vnode *)); +struct buf *trypbuf __P((int *)); void vfs_bio_need_satisfy __P((void)); #endif /* KERNEL */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 191fdbc..f2b0f4b 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.60 1998/10/31 14:05:11 peter Exp $ + * $Id: buf.h,v 1.61 1998/11/13 01:01:44 dg Exp $ */ #ifndef _SYS_BUF_H_ @@ -116,7 +116,10 @@ struct buf { caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ - void *b_spc; + union pager_info { + void *pg_spc; + int pg_reqpage; + } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; @@ -126,9 +129,29 @@ struct buf { struct workhead b_dep; /* List of filesystem dependencies. */ }; +#define b_spc b_pager.pg_spc + /* * These flags are kept in b_flags. + * + * Notes: + * + * B_ASYNC VOP calls on bp's are usually async whether or not + * B_ASYNC is set, but some subsystems, such as NFS, like + * to know what is best for the caller so they can + * optimize the I/O. + * + * B_PAGING Indicates that bp is being used by the paging system or + * some paging system and that the bp is not linked into + * the b_vp's clean/dirty linked lists or ref counts. + * Buffer vp reassignments are illegal in this case. + * + * B_CACHE This may only be set if the buffer is entirely valid. + * The situation where B_DELWRI is set and B_CACHE gets + * cleared MUST be committed to disk so B_DELWRI can + * also be cleared. */ + #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. 
*/ @@ -312,13 +335,12 @@ int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); -struct buf * getpbuf __P((void)); +struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); -int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); @@ -336,13 +358,15 @@ void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); -void relpbuf __P((struct buf *)); +void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); +int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); -struct buf *trypbuf __P((void)); +void bpreassignbuf __P((struct buf *, struct vnode *)); +struct buf *trypbuf __P((int *)); void vfs_bio_need_satisfy __P((void)); #endif /* KERNEL */ diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index d8e0cd8..87949b8 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)malloc.h 8.5 (Berkeley) 5/3/95 - * $Id: malloc.h,v 1.37 1998/03/08 09:58:26 julian Exp $ + * $Id: malloc.h,v 1.38 1998/11/10 08:46:24 peter Exp $ */ #ifndef _SYS_MALLOC_H_ @@ -42,11 +42,13 @@ #define KMEMSTATS /* - * flags to malloc + * flags to malloc. */ + #define M_WAITOK 0x0000 -#define M_NOWAIT 0x0001 -#define M_KERNEL 0x0002 +#define M_NOWAIT 0x0001 /* do not block */ +#define M_USE_RESERVE 0x0002 /* can alloc out of reserve memory */ +#define M_ASLEEP 0x0004 /* async sleep on failure */ #define M_MAGIC 877983977 /* time when first defined :-) */ diff --git a/sys/sys/param.h b/sys/sys/param.h index badddca..fb15db3 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 - * $Id: param.h,v 1.37 1998/10/16 04:28:04 jkh Exp $ + * $Id: param.h,v 1.38 1998/10/16 06:55:07 jkh Exp $ */ #ifndef _SYS_PARAM_H_ @@ -227,4 +227,10 @@ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<<FSHIFT) +#define dbtoc(db) /* calculates devblks to pages */ \ + ((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT)) + +#define ctodb(db) /* calculates pages to devblks */ \ + ((db) << (PAGE_SHIFT - DEV_BSHIFT)) + #endif /* _SYS_PARAM_H_ */ diff --git a/sys/sys/types.h b/sys/sys/types.h index 93f8698..c65fe67 100644 --- a/sys/sys/types.h +++ b/sys/sys/types.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)types.h 8.6 (Berkeley) 2/19/95 - * $Id: types.h,v 1.25 1998/06/07 17:13:05 dfr Exp $ + * $Id: types.h,v 1.26 1998/12/19 00:02:34 dt Exp $ */ #ifndef _SYS_TYPES_H_ @@ -68,6 +68,7 @@ typedef quad_t * qaddr_t; typedef char * caddr_t; /* core address */ typedef int32_t daddr_t; /* disk address */ +typedef u_int32_t u_daddr_t; /* unsigned disk address */ typedef u_int32_t dev_t; /* device number */ typedef u_int32_t fixpt_t; /* fixed point number */ typedef u_int32_t gid_t; /* group id */ diff --git a/sys/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h index ca19cc4..ae5b7af 100644 --- a/sys/ufs/mfs/mfs_extern.h +++ b/sys/ufs/mfs/mfs_extern.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
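
The dbtoc()/ctodb() pair added to param.h converts between device blocks and pages with pure shifts, dbtoc() rounding any partial page up. A self-contained check, assuming the usual DEV_BSHIFT of 9 (512-byte blocks) and PAGE_SHIFT of 12 (4K pages); the bodies mirror the diff, with the macro argument fully parenthesized:

#include <stdio.h>

#define DEV_BSHIFT	9
#define PAGE_SHIFT	12

#define ctodb(pg)	((pg) << (PAGE_SHIFT - DEV_BSHIFT))
#define dbtoc(db)	(((db) + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))

int
main(void)
{
	printf("ctodb(1) = %d\n", ctodb(1));	/* 8 device blocks per page */
	printf("dbtoc(8) = %d\n", dbtoc(8));	/* exactly one page */
	printf("dbtoc(9) = %d\n", dbtoc(9));	/* partial page rounds up: 2 */
	return (0);
}
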
* * @(#)mfs_extern.h 8.4 (Berkeley) 3/30/95 - * $Id: mfs_extern.h,v 1.10 1997/10/16 10:50:00 phk Exp $ + * $Id: mfs_extern.h,v 1.11 1998/02/03 21:52:02 bde Exp $ */ #ifndef _UFS_MFS_MFS_EXTERN_H_ @@ -41,8 +41,9 @@ struct buf; struct mount; struct proc; struct vnode; +struct mfsnode; -void mfs_doio __P((struct buf *bp, caddr_t base)); +void mfs_doio __P((struct buf *bp, struct mfsnode *mfsnode)); int mfs_mountfs __P((struct vnode *, struct mount *, struct proc *)); int mfs_mountroot __P((void)); diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c index 1ea0804..73ab75a 100644 --- a/sys/ufs/mfs/mfs_vfsops.c +++ b/sys/ufs/mfs/mfs_vfsops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 - * $Id: mfs_vfsops.c,v 1.52 1998/12/07 21:58:49 archie Exp $ + * $Id: mfs_vfsops.c,v 1.53 1999/01/01 04:14:11 dillon Exp $ */ @@ -64,8 +64,10 @@ MALLOC_DEFINE(M_MFSNODE, "MFS node", "MFS vnode private part"); u_char * mfs_getimage __P((void)); +#ifdef MFS_ROOT static caddr_t mfs_rootbase; /* address of mini-root in kernel virtual memory */ static u_long mfs_rootsize; /* size of mini-root in bytes */ +#endif static int mfs_minor; /* used for building internal dev_t */ @@ -178,7 +180,9 @@ mfs_mount(mp, path, data, ndp, p) struct mfs_args args; struct ufsmount *ump; struct fs *fs; +#ifdef MFS_ROOT u_char *base; +#endif struct mfsnode *mfsp; u_int size; int flags, err; @@ -344,7 +348,9 @@ mfs_mount(mp, path, data, ndp, p) goto error_2; } +#ifdef MFS_ROOT dostatfs: +#endif /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname @@ -387,11 +393,8 @@ mfs_start(mp, flags, p) register struct vnode *vp = VFSTOUFS(mp)->um_devvp; register struct mfsnode *mfsp = VTOMFS(vp); register struct buf *bp; - register caddr_t base; register int gotsig = 0; - base = mfsp->mfs_baseoff; - /* * Must set P_SYSTEM to prevent system from trying to kill * this process. What happens is that the process is unkillable, @@ -402,11 +405,20 @@ mfs_start(mp, flags, p) curproc->p_flag |= P_SYSTEM; while (mfsp->mfs_active) { + int s; + + s = splbio(); + while (bp = bufq_first(&mfsp->buf_queue)) { bufq_remove(&mfsp->buf_queue, bp); - mfs_doio(bp, base); + splx(s); + mfs_doio(bp, mfsp); wakeup((caddr_t)bp); + s = splbio(); } + + splx(s); + /* * If a non-ignored signal is received, try to unmount. * If that fails, clear the signal (it has been "processed"), diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c index 88cfec6..083843c 100644 --- a/sys/ufs/mfs/mfs_vnops.c +++ b/sys/ufs/mfs/mfs_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
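
The reworked mfs_start() loop in mfs_vfsops.c above holds splbio() only across the queue manipulation and drops it for the potentially slow mfs_doio() copy, since buffers can be queued from bio interrupt level. A toy user-space rendering of just that loop shape; the spl and bufq stand-ins below are fakes, present only so the control flow compiles:

#include <stdio.h>

static int queue[] = { 11, 22, 33, 0 };		/* fake buf queue, 0 ends it */
static int qhead;

static int splbio(void) { return (0); }	/* would raise block-I/O ipl */
static void splx(int s) { (void)s; }		/* would restore previous ipl */
static int *bufq_first_demo(void) { return (queue[qhead] ? &queue[qhead] : NULL); }
static void bufq_remove_demo(void) { qhead++; }

int
main(void)
{
	int s, *bp;

	s = splbio();			/* protect the queue scan */
	while ((bp = bufq_first_demo()) != NULL) {
		bufq_remove_demo();
		splx(s);		/* drop ipl for the slow copy */
		printf("mfs_doio on buf %d\n", *bp);
		s = splbio();		/* retake before touching the queue */
	}
	splx(s);
	return (0);
}
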
* * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 - * $Id: mfs_vnops.c,v 1.37 1998/07/11 07:46:05 bde Exp $ + * $Id: mfs_vnops.c,v 1.38 1998/09/07 06:52:01 phk Exp $ */ #include <sys/param.h> @@ -41,6 +41,8 @@ #include <sys/buf.h> #include <sys/vnode.h> #include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/mman.h> #include <miscfs/specfs/specdev.h> @@ -51,6 +53,7 @@ static int mfs_badop __P((struct vop_generic_args *)); static int mfs_bmap __P((struct vop_bmap_args *)); static int mfs_close __P((struct vop_close_args *)); static int mfs_fsync __P((struct vop_fsync_args *)); +static int mfs_freeblks __P((struct vop_freeblks_args *)); static int mfs_inactive __P((struct vop_inactive_args *)); /* XXX */ static int mfs_open __P((struct vop_open_args *)); static int mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */ @@ -66,7 +69,7 @@ static struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { { &vop_bmap_desc, (vop_t *) mfs_bmap }, { &vop_bwrite_desc, (vop_t *) vop_defaultop }, { &vop_close_desc, (vop_t *) mfs_close }, - { &vop_freeblks_desc, (vop_t *) vop_defaultop }, + { &vop_freeblks_desc, (vop_t *) mfs_freeblks }, { &vop_fsync_desc, (vop_t *) mfs_fsync }, { &vop_getpages_desc, (vop_t *) mfs_getpages }, { &vop_inactive_desc, (vop_t *) mfs_inactive }, @@ -119,6 +122,38 @@ mfs_fsync(ap) } /* + * mfs_freeblks() - hook to allow us to free physical memory. + * + * We implement the B_FREEBUF strategy. We can't just madvise() + * here because we have to do it in the correct order vs other bio + * requests, so we queue it. + */ + +static int +mfs_freeblks(ap) + struct vop_freeblks_args /* { + struct vnode *a_vp; + daddr_t a_addr; + daddr_t a_length; + } */ *ap; +{ + struct buf *bp; + struct vnode *vp; + + if (!vfinddev(ap->a_vp->v_rdev, VBLK, &vp) || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + + bp = geteblk(ap->a_length); + bp->b_flags |= B_FREEBUF | B_BUSY; + bp->b_dev = ap->a_vp->v_rdev; + bp->b_blkno = ap->a_addr; + bp->b_offset = dbtob(ap->a_addr); + bp->b_bcount = ap->a_length; + VOP_STRATEGY(vp, bp); + return(0); +} + +/* * Pass I/O requests to the memory filesystem process. */ static int @@ -132,26 +167,50 @@ mfs_strategy(ap) register struct mfsnode *mfsp; struct vnode *vp; struct proc *p = curproc; /* XXX */ + int s; if (!vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0) panic("mfs_strategy: bad dev"); mfsp = VTOMFS(vp); - /* check for mini-root access */ + + /* + * splbio required for queueing/dequeueing, in case of forwarded + * BPs from bio interrupts (??). It may not be necessary. + */ + + s = splbio(); + if (mfsp->mfs_pid == 0) { + /* + * mini-root. Note: B_FREEBUF not supported at the moment, + * I'm not sure what kind of dataspace b_data is in. + */ caddr_t base; base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_FREEBUF) + ; if (bp->b_flags & B_READ) bcopy(base, bp->b_data, bp->b_bcount); else bcopy(bp->b_data, base, bp->b_bcount); biodone(bp); } else if (mfsp->mfs_pid == p->p_pid) { - mfs_doio(bp, mfsp->mfs_baseoff); + /* + * VOP to self + */ + splx(s); + mfs_doio(bp, mfsp); + s = splbio(); } else { + /* + * VOP from some other process, queue to MFS process and + * wake it up. + */ bufq_insert_tail(&mfsp->buf_queue, bp); wakeup((caddr_t)vp); } + splx(s); return (0); } @@ -159,18 +218,59 @@ mfs_strategy(ap) * Memory file system I/O. * * Trivial on the HP since buffer has already been mapping into KVA space. + * + * Read and Write are handled with a simple copyin and copyout. 
+ * + * We also partially support VOP_FREEBLKS() via B_FREEBUF. We can't implement it + * completely -- for example, on fragments or inode metadata, but we can + * implement it for page-aligned requests. */ void -mfs_doio(bp, base) +mfs_doio(bp, mfsp) register struct buf *bp; - caddr_t base; + struct mfsnode *mfsp; { + caddr_t base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + + if (bp->b_flags & B_FREEBUF) { + /* + * Implement B_FREEBUF, which allows the filesystem to tell + * a block device when blocks are no longer needed (like when + * a file is deleted). We use the hook to MADV_FREE the VM. + * This makes an MFS filesystem work as well or better than + * a sun-style swap-mounted filesystem. + */ + int bytes = bp->b_bcount; + + if ((vm_offset_t)base & PAGE_MASK) { + int n = PAGE_SIZE - ((vm_offset_t)base & PAGE_MASK); + bytes -= n; + base += n; + } + if (bytes > 0) { + struct madvise_args uap; - base += (bp->b_blkno << DEV_BSHIFT); - if (bp->b_flags & B_READ) + bytes &= ~PAGE_MASK; + if (bytes != 0) { + bzero(&uap, sizeof(uap)); + uap.addr = base; + uap.len = bytes; + uap.behav = MADV_FREE; + madvise(curproc, &uap); + } + } + bp->b_error = 0; + } else if (bp->b_flags & B_READ) { + /* + * Read data from our 'memory' disk + */ bp->b_error = copyin(base, bp->b_data, bp->b_bcount); - else + } else { + /* + * Write data to our 'memory' disk + */ bp->b_error = copyout(bp->b_data, base, bp->b_bcount); + } if (bp->b_error) bp->b_flags |= B_ERROR; biodone(bp); @@ -222,7 +322,7 @@ mfs_close(ap) */ while (bp = bufq_first(&mfsp->buf_queue)) { bufq_remove(&mfsp->buf_queue, bp); - mfs_doio(bp, mfsp->mfs_baseoff); + mfs_doio(bp, mfsp); wakeup((caddr_t)bp); } /* diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index 026d3486..fd3555a 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 - * $Id: ufs_readwrite.c,v 1.54 1998/12/15 03:29:52 julian Exp $ + * $Id: ufs_readwrite.c,v 1.55 1999/01/07 16:14:19 bde Exp $ */ #define BLKSIZE(a, b, c) blksize(a, b, c) @@ -392,7 +392,10 @@ WRITE(ap) panic("%s: nonsync dir write", WRITE_S); break; default: - panic("%s: type", WRITE_S); + panic("%s: type %p %d (%d,%d)", WRITE_S, vp, (int)vp->v_type, + (int)uio->uio_offset, + (int)uio->uio_resid + ); } fs = ip->I_FS; @@ -598,9 +601,8 @@ ffs_getpages(ap) vm_page_busy(m); vm_page_free(m); } else if (m == mreq) { - while (m->flags & PG_BUSY) { - vm_page_sleep(m, "ffspwt", NULL); - } + while (vm_page_sleep_busy(m, FALSE, "ffspwt")) + ; vm_page_busy(m); vp->v_lastr = m->pindex + 1; } else { diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 1010085..49e1a29 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 - * $Id: ufs_vnops.c,v 1.103 1998/12/24 09:45:10 bde Exp $ + * $Id: ufs_vnops.c,v 1.104 1999/01/07 16:14:19 bde Exp $ */ #include "opt_quota.h" @@ -1731,6 +1731,9 @@ ufs_abortop(ap) /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. + * + * In order to be able to swap to a file, the VOP_BMAP operation may not + * deadlock on memory. See ufs_bmap() for details. 
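
Since MADV_FREE only makes sense on whole pages, the B_FREEBUF path above shaves the unaligned head and tail off the request before calling madvise(); fragments and inode metadata therefore simply are not freed. Worked numbers, assuming 4K pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumption for the demo */
#define PAGE_MASK	(PAGE_SIZE - 1)

int
main(void)
{
	unsigned long base = 0x10400;	/* request starts 1K into a page */
	unsigned long bytes = 10240;	/* and is 10K long */
	unsigned long n;

	if (base & PAGE_MASK) {		/* shave the unaligned head */
		n = PAGE_SIZE - (base & PAGE_MASK);
		bytes -= n;
		base += n;
	}
	bytes &= ~PAGE_MASK;		/* shave the unaligned tail */
	printf("MADV_FREE range: base=0x%lx bytes=%lu (%lu page(s))\n",
	    base, bytes, bytes / PAGE_SIZE);	/* 0x11000, 4096, 1 page */
	return (0);
}
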
*/ int ufs_strategy(ap) diff --git a/sys/vm/default_pager.c b/sys/vm/default_pager.c index ba92894..16b7512 100644 --- a/sys/vm/default_pager.c +++ b/sys/vm/default_pager.c @@ -28,7 +28,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: default_pager.c,v 1.15 1998/02/06 12:14:20 eivind Exp $ + * The default pager is responsible for supplying backing store to unbacked + * storage. The backing store is usually swap so we just fall through to + * the swap routines. However, since swap metadata has not been assigned, + * the swap routines assign and manage the swap backing store through the + * vm_page->swapblk field. The object is only converted when the page is + * physically freed after having been cleaned and even then vm_page->swapblk + * is maintained whenever a resident page also has swap backing store. + * + * $Id: default_pager.c,v 1.16 1998/10/13 08:24:42 dg Exp $ */ #include <sys/param.h> @@ -78,6 +86,14 @@ default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, return vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(round_page(offset + size))); } +/* + * deallocate resources associated with default objects. The default objects + * have no special resources allocated to them, but the vm_page's being used + * in this object might. Still, we do not have to do anything - we will free + * the swapblk in the underlying vm_page's when we free the vm_page or + * garbage collect the vm_page cache list. + */ + static void default_pager_dealloc(object) vm_object_t object; @@ -88,9 +104,11 @@ default_pager_dealloc(object) } /* - * The default pager has no backing store, so we always return - * failure. + * Load pages from backing store. Since OBJT_DEFAULT is converted to + * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never + * see a vm_page with assigned swap here. */ + static int default_pager_getpages(object, m, count, reqpage) vm_object_t object; @@ -101,6 +119,13 @@ default_pager_getpages(object, m, count, reqpage) return VM_PAGER_FAIL; } +/* + * Store pages to backing store. We should assign swap and initiate + * I/O. We do not actually convert the object to OBJT_SWAP here. The + * object will be converted when the written-out vm_page_t is moved from the + * cache to the free list. + */ + static int default_pager_putpages(object, m, c, sync, rtvals) vm_object_t object; @@ -109,26 +134,22 @@ default_pager_putpages(object, m, c, sync, rtvals) boolean_t sync; int *rtvals; { - int i; - - /* - * Try to convert the object type into a OBJT_SWAP. - * If the swp structure allocation fails, convert it - * back to OBJT_DEFAULT and return failure. Otherwise - * pass this putpages to the swap pager. - */ - object->type = OBJT_SWAP; - - if (swap_pager_swp_alloc(object, M_KERNEL) != 0) { - object->type = OBJT_DEFAULT; - for (i = 0; i < c; i++) - rtvals[i] = VM_PAGER_FAIL; - return VM_PAGER_FAIL; - } - return swap_pager_putpages(object, m, c, sync, rtvals); } +/* + * Tell us whether the backing store for the requested (object,index) is + * synchronized. i.e. tell us whether we can throw the page away and + * reload it later. So, for example, if we are in the process of writing + * the page to its backing store, or if no backing store has been assigned, + * it is not yet synchronized. + * + * It is possible to have fully-synchronized swap assigned without the + * object having been converted. 
We just call swap_pager_haspage() to + * deal with it since it must already deal with it plus deal with swap + * meta-data structures. + */ + static boolean_t default_pager_haspage(object, pindex, before, after) vm_object_t object; @@ -139,24 +160,3 @@ default_pager_haspage(object, pindex, before, after) return FALSE; } -void -default_pager_convert_to_swap(object) - vm_object_t object; -{ - object->type = OBJT_SWAP; - if (swap_pager_swp_alloc(object, M_KERNEL) != 0) { - object->type = OBJT_DEFAULT; - } -} - -void -default_pager_convert_to_swapq(object) - vm_object_t object; -{ - if (object && - (object->type == OBJT_DEFAULT) && - (object != kernel_object && object != kmem_object) && - (object->size > ((cnt.v_page_count - cnt.v_wire_count) / 4))) - default_pager_convert_to_swap(object); -} - diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index a200b9c..cc742b0 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 - * $Id: device_pager.c,v 1.36 1998/12/07 21:58:50 archie Exp $ + * $Id: device_pager.c,v 1.37 1999/01/08 17:31:23 eivind Exp $ */ #include <sys/param.h> @@ -200,7 +200,7 @@ dev_pager_getpages(object, m, count, reqpage) int prot; dev = (dev_t) (uintptr_t) object->handle; - offset = m[reqpage]->pindex + OFF_TO_IDX(object->paging_offset); + offset = m[reqpage]->pindex; prot = PROT_READ; /* XXX should pass in? */ mapfunc = cdevsw[major(dev)]->d_mmap; diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 1691168..b063520 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 @@ -36,17 +37,34 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * + * New Swap System + * Matthew Dillon + * + * Radix Bitmap 'blists'. + * + * - The new swapper uses the new radix bitmap code. This should scale + * to arbitrarily small or arbitrarily large swap spaces and an almost + * arbitrary degree of fragmentation. + * + * Features: + * + * - on the fly reallocation of swap during putpages. The new system + * does not try to keep previously allocated swap blocks for dirty + * pages. + * + * - on the fly deallocation of swap + * + * - No more garbage collection required. Unnecessarily allocated swap + * blocks only exist for dirty vm_page_t's now and these are already + * cycled (in a high-load system) by the pager. We also do on-the-fly + * removal of invalidated swap blocks when a page is destroyed + * or renamed. + * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 - * $Id: swap_pager.c,v 1.106 1999/01/08 17:31:23 eivind Exp $ - */ - -/* - * Quick hack to page to dedicated partition(s). 
- * TODO: - * Add multiprocessor locks - * Deal with async writes in a better fashion + * + * $Id: swap_pager.c,v 1.107 1999/01/10 01:58:28 eivind Exp $ */ #include <sys/param.h> @@ -57,18 +75,16 @@ #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/vmmeter.h> -#include <sys/rlist.h> +#include <sys/blist.h> +#include <sys/lock.h> #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif -#ifndef NPENDINGIO -#define NPENDINGIO 16 -#endif - -#define SWB_NPAGES MAX_PAGEOUT_CLUSTER +#define SWB_NPAGES MAX_PAGEOUT_CLUSTER +#include "opt_swap.h" #include <vm/vm.h> #include <vm/vm_prot.h> #include <vm/vm_object.h> @@ -77,848 +93,651 @@ #include <vm/vm_pageout.h> #include <vm/swap_pager.h> #include <vm/vm_extern.h> +#include <vm/vm_zone.h> -static int nswiodone; -int swap_pager_full; -extern int vm_swap_size; -static int no_swap_space = 1; -static int max_pageout_cluster; -struct rlisthdr swaplist; - -TAILQ_HEAD(swpclean, swpagerclean); - -typedef struct swpagerclean *swp_clean_t; +#define SWM_FREE 0x02 /* free, period */ +#define SWM_POP 0x04 /* pop out */ -static struct swpagerclean { - TAILQ_ENTRY(swpagerclean) spc_list; - int spc_flags; - struct buf *spc_bp; - vm_object_t spc_object; - vm_offset_t spc_kva; - int spc_first; - int spc_count; - vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; -} swcleanlist[NPENDINGIO]; - - -/* spc_flags values */ -#define SPC_ERROR 0x01 +/* + * vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks + * in the old system. + */ -#define SWB_EMPTY (-1) +extern int vm_swap_size; /* number of free swap blocks, in pages */ -/* list of completed page cleans */ -static struct swpclean swap_pager_done; +int swap_pager_full; /* swap space exhaustion (w/ hysteresis)*/ +static int nsw_rcount; /* free read buffers */ +static int nsw_wcount; /* free write buffers */ +static int nsw_hysteresis; /* hysteresis */ +static int max_pageout_cluster; /* maximum VOP I/O allowed */ +static int sw_alloc_interlock; /* swap pager allocation interlock */ -/* list of pending page cleans */ -static struct swpclean swap_pager_inuse; +struct blist *swapblist; +static struct swblock **swhash; +static int swhash_mask; -/* list of free pager clean structs */ -static struct swpclean swap_pager_free; -static int swap_pager_free_count; -static int swap_pager_free_pending; -/* list of "named" anon region objects */ -static struct pagerlst swap_pager_object_list; +/* + * "named" and "unnamed" anon region objects. Try to reduce the overhead + * of searching a named list by hashing it just a little. + */ -/* list of "unnamed" anon region objects */ -struct pagerlst swap_pager_un_object_list; +#define NOBJLISTS 8 -#define SWAP_FREE_NEEDED 0x1 /* need a swap block */ -#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2 -static int swap_pager_needflags; +#define NOBJLIST(handle) \ + (&swap_pager_object_list[((int)(long)handle >> 4) & (NOBJLISTS-1)]) -static struct pagerlst *swp_qs[] = { - &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0 -}; +static struct pagerlst swap_pager_object_list[NOBJLISTS]; +struct pagerlst swap_pager_un_object_list; +vm_zone_t swap_zone; /* - * pagerops for OBJT_SWAP - "swap pager". + * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure + * calls hooked from other parts of the VM system and do not appear here. + * (see vm/swap_pager.h). 
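
NOBJLIST() above spreads named anonymous objects over eight hash lists; shifting the handle right by 4 first keeps the low alignment bits of a pointer-valued handle from dumping everything on list 0. A quick stand-alone check of the index arithmetic (NOBJLIST_IDX is a local name for the macro's index expression):

#include <stdio.h>

#define NOBJLISTS	8
#define NOBJLIST_IDX(handle) \
	((((int)(long)(handle)) >> 4) & (NOBJLISTS - 1))

int
main(void)
{
	long h;

	/* Handles 16 bytes apart land on successive lists, then wrap. */
	for (h = 0x1000; h < 0x1000 + 16 * 10; h += 16)
		printf("handle 0x%lx -> list %d\n", h, NOBJLIST_IDX(h));
	return (0);
}
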
*/ + static vm_object_t swap_pager_alloc __P((void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset)); static void swap_pager_dealloc __P((vm_object_t object)); -static boolean_t - swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, - int *before, int *after)); static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static void swap_pager_init __P((void)); -static void spc_free __P((swp_clean_t)); +static void swap_pager_unswapped __P((vm_page_t)); struct pagerops swappagerops = { - swap_pager_init, - swap_pager_alloc, - swap_pager_dealloc, - swap_pager_getpages, - swap_pager_putpages, - swap_pager_haspage, - swap_pager_sync + swap_pager_init, /* early system initialization of pager */ + swap_pager_alloc, /* allocate an OBJT_SWAP object */ + swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ + swap_pager_getpages, /* pagein */ + swap_pager_putpages, /* pageout */ + swap_pager_haspage, /* get backing store status for page */ + swap_pager_unswapped /* remove swap related to page */ }; -static int npendingio; -static int dmmin; +/* + * dmmax is in page-sized chunks with the new swap system. It was + * dev-bsized chunks in the old. + * + * swap_*() routines are externally accessible. swp_*() routines are + * internal. + */ + int dmmax; +static int dmmax_mask; +int nswap_lowat = 128; /* in pages, swap_pager_full warning */ +int nswap_hiwat = 256; /* in pages, swap_pager_full warning */ + +static __inline void swp_sizecheck __P((void)); +static void swp_pager_sync_iodone __P((struct buf *bp)); +static void swp_pager_async_iodone __P((struct buf *bp)); + +/* + * Swap bitmap functions + */ + +static __inline void swp_pager_freeswapspace __P((daddr_t blk, int npages)); +static __inline daddr_t swp_pager_getswapspace __P((int npages)); + +/* + * Metadata functions + */ + +static void swp_pager_meta_build __P((vm_object_t, daddr_t, daddr_t, int)); +static void swp_pager_meta_free __P((vm_object_t, daddr_t, daddr_t)); +static void swp_pager_meta_free_all __P((vm_object_t)); +static daddr_t swp_pager_meta_ctl __P((vm_object_t, vm_pindex_t, int)); -static int swap_pager_block_index __P((vm_pindex_t pindex)); -static int swap_pager_block_offset __P((vm_pindex_t pindex)); -static daddr_t *swap_pager_diskaddr __P((vm_object_t object, - vm_pindex_t pindex, int *valid)); -static void swap_pager_finish __P((swp_clean_t spc)); -static void swap_pager_free_swap __P((vm_object_t object)); -static void swap_pager_freeswapspace __P((vm_object_t object, - unsigned int from, - unsigned int to)); -static int swap_pager_getswapspace __P((vm_object_t object, - unsigned int amount, - daddr_t *rtval)); -static void swap_pager_iodone __P((struct buf *)); -static void swap_pager_iodone1 __P((struct buf *bp)); -static void swap_pager_reclaim __P((void)); -static void swap_pager_ridpages __P((vm_page_t *m, int count, - int reqpage)); -static void swap_pager_setvalid __P((vm_object_t object, - vm_offset_t offset, int valid)); -static __inline void swapsizecheck __P((void)); - -#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE))) +/* + * SWP_SIZECHECK() - update swap_pager_full indication + * + * update the swap_pager_full indication and warn when we are + * about to run out of swap space. + * + * No restrictions on call + * This routine may not block. 
+ * This routine must be called at splvm() + */ static __inline void -swapsizecheck() +swp_sizecheck() { - if (vm_swap_size < 128 * btodb(PAGE_SIZE)) { + if (vm_swap_size < nswap_lowat) { if (swap_pager_full == 0) printf("swap_pager: out of swap space\n"); swap_pager_full = 1; - } else if (vm_swap_size > 192 * btodb(PAGE_SIZE)) + } else if (vm_swap_size > nswap_hiwat) { swap_pager_full = 0; + } } +/* + * SWAP_PAGER_INIT() - initialize the swap pager! + * + * Expected to be started from system init. NOTE: This code is run + * before much else so be careful what you depend on. Most of the VM + * system has yet to be initialized at this point. + */ + static void swap_pager_init() { - int maxsafepending; - TAILQ_INIT(&swap_pager_object_list); - TAILQ_INIT(&swap_pager_un_object_list); - /* - * Initialize clean lists + * Initialize object lists */ - TAILQ_INIT(&swap_pager_inuse); - TAILQ_INIT(&swap_pager_done); - TAILQ_INIT(&swap_pager_free); - swap_pager_free_count = 0; + int i; + + for (i = 0; i < NOBJLISTS; ++i) + TAILQ_INIT(&swap_pager_object_list[i]); + TAILQ_INIT(&swap_pager_un_object_list); /* - * Calculate the swap allocation constants. + * Device Stripe, in PAGE_SIZE'd blocks */ - dmmin = PAGE_SIZE / DEV_BSIZE; - dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2; - - maxsafepending = cnt.v_free_min - cnt.v_free_reserved; - npendingio = NPENDINGIO; - max_pageout_cluster = MAX_PAGEOUT_CLUSTER; - - if ((2 * NPENDINGIO * MAX_PAGEOUT_CLUSTER) > maxsafepending) { - max_pageout_cluster = MAX_PAGEOUT_CLUSTER / 2; - npendingio = maxsafepending / (2 * max_pageout_cluster); - if (npendingio < 2) - npendingio = 2; - } + + dmmax = SWB_NPAGES * 2; + dmmax_mask = ~(dmmax - 1); } +/* + * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process + * + * Expected to be started from pageout process once, prior to entering + * its main loop. + */ + void swap_pager_swap_init() { - swp_clean_t spc; - struct buf *bp; - int i; + int n; /* - * kva's are allocated here so that we dont need to keep doing - * kmem_alloc pageables at runtime + * Number of in-transit swap bp operations. Don't + * exhaust the pbufs completely. Make sure we + * initialize workable values (0 will work for hysteresis + * but it isn't very efficient). + * + * The max_pageout_cluster is constrained by the bp->b_pages[] + * array (MAXPHYS/PAGE_SIZE) and our locally defined + * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are + * constrained by the swap device interleave stripe size. 
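
swp_sizecheck() above is a small hysteresis latch: swap_pager_full sets below nswap_lowat but clears only above nswap_hiwat, so the console warning cannot flap while free swap hovers near one threshold. A compact trace of that behavior with the default watermarks:

#include <stdio.h>

static int nswap_lowat = 128;	/* defaults from the diff, in pages */
static int nswap_hiwat = 256;
static int swap_pager_full;

static void
sizecheck_demo(int vm_swap_size)
{
	if (vm_swap_size < nswap_lowat)
		swap_pager_full = 1;
	else if (vm_swap_size > nswap_hiwat)
		swap_pager_full = 0;
	/* between the marks the flag keeps its old value */
}

int
main(void)
{
	static int sizes[] = { 300, 100, 200, 200, 257 };
	int i;

	for (i = 0; i < 5; i++) {
		sizecheck_demo(sizes[i]);
		printf("free %3d pages -> full=%d\n", sizes[i],
		    swap_pager_full);	/* 0, 1, 1, 1, 0 */
	}
	return (0);
}
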
*/ - for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) { - spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * max_pageout_cluster); - if (!spc->spc_kva) { - break; - } - spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL); - if (!spc->spc_bp) { - kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); - break; - } - spc->spc_flags = 0; - TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); - swap_pager_free_count++; - } -} -int -swap_pager_swp_alloc(object, wait) - vm_object_t object; - int wait; -{ - sw_blk_t swb; - int nblocks; - int i, j; - - nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES; - swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait); - if (swb == NULL) - return 1; - - for (i = 0; i < nblocks; i++) { - swb[i].swb_valid = 0; - swb[i].swb_locked = 0; - for (j = 0; j < SWB_NPAGES; j++) - swb[i].swb_block[j] = SWB_EMPTY; - } + nsw_rcount = (nswbuf + 1) / 2; + nsw_wcount = (nswbuf + 3) / 4; + nsw_hysteresis = nsw_wcount / 2; + max_pageout_cluster = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); - object->un_pager.swp.swp_nblocks = nblocks; - object->un_pager.swp.swp_allocsize = 0; - object->un_pager.swp.swp_blocks = swb; - object->un_pager.swp.swp_poip = 0; + /* + * Initialize our zone. Right now I'm just guessing on the number + * we need based on the number of pages in the system. Each swblock + * can hold 16 pages, so this is probably overkill. + */ - if (object->handle != NULL) { - TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list); - } else { - TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); - } + n = cnt.v_page_count * 2; - return 0; + swap_zone = zinit( + "SWAPMETA", + sizeof(struct swblock), + n, + ZONE_INTERRUPT, + 1 + ); + + /* + * Initialize our meta-data hash table. The swapper does not need to + * be quite as efficient as the VM system, so we do not use an + * oversized hash table. + * + * n: size of hash table, must be power of 2 + * swhash_mask: hash table index mask + */ + + for (n = 1; n < cnt.v_page_count / 4; n <<= 1) + ; + + swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK); + bzero(swhash, sizeof(struct swblock *) * n); + + swhash_mask = n - 1; } /* - * Allocate an object and associated resources. - * Note that if we are called from the pageout daemon (handle == NULL) - * we should not wait for memory as it could resulting in deadlock. + * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate + * its metadata structures. + * + * This routine is called from the mmap and fork code to create a new + * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object + * and then converting it with swp_pager_meta_build(). + * + * This routine may block in vm_object_allocate() and create a named + * object lookup race, so we must interlock. We must also run at + * splvm() for the object lookup to handle races with interrupts, but + * we do not have to maintain splvm() in between the lookup and the + * add because (I believe) it is not possible to attempt to create + * a new swap object w/handle when a default object with that handle + * already exists. */ + static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; - /* - * If this is a "named" anonymous region, look it up and use the - * object if it exists, otherwise allocate a new one. - */ if (handle) { - object = vm_pager_object_lookup(&swap_pager_object_list, handle); + /* + * Reference existing named region or allocate new one. 
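
The hash-table sizing loop above just finds the smallest power of two that is at least v_page_count/4, which is exactly what makes the n - 1 index mask (swhash_mask) valid. For example, on a hypothetical machine with 100000 pages:

#include <stdio.h>

int
main(void)
{
	int v_page_count = 100000;	/* example value */
	int n;

	/* Same loop as swap_pager_swap_init(). */
	for (n = 1; n < v_page_count / 4; n <<= 1)
		;
	printf("v_page_count=%d -> swhash size=%d, mask=0x%x\n",
	    v_page_count, n, n - 1);	/* size 32768, mask 0x7fff */
	return (0);
}
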
There + * should not be a race here against swp_pager_meta_build() + * as called from vm_page_remove() in regards to the lookup + * of the handle. + */ + + while (sw_alloc_interlock) { + sw_alloc_interlock = -1; + tsleep(&sw_alloc_interlock, PVM, "swpalc", 0); + } + sw_alloc_interlock = 1; + + object = vm_pager_object_lookup(NOBJLIST(handle), handle); + if (object != NULL) { vm_object_reference(object); } else { - /* - * XXX - there is a race condition here. Two processes - * can request the same named object simultaneuously, - * and if one blocks for memory, the result is a disaster. - * Probably quite rare, but is yet another reason to just - * rip support of "named anonymous regions" out altogether. - */ - object = vm_object_allocate(OBJT_SWAP, + object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(offset + PAGE_MASK + size)); object->handle = handle; - (void) swap_pager_swp_alloc(object, M_WAITOK); + + swp_pager_meta_build( + object, + 0, + SWAPBLK_NONE, + 0 + ); } + + if (sw_alloc_interlock < 0) + wakeup(&sw_alloc_interlock); + + sw_alloc_interlock = 0; } else { - object = vm_object_allocate(OBJT_SWAP, + object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(offset + PAGE_MASK + size)); - (void) swap_pager_swp_alloc(object, M_WAITOK); + + swp_pager_meta_build( + object, + 0, + SWAPBLK_NONE, + 0 + ); } return (object); } /* - * returns disk block associated with pager and offset - * additionally, as a side effect returns a flag indicating - * if the block has been written + * SWAP_PAGER_DEALLOC() - remove swap metadata from object + * + * The swap backing for the object is destroyed. The code is + * designed such that we can reinstantiate it later, but this + * routine is typically called only when the entire object is + * about to be destroyed. + * + * This routine may block, but no longer does. + * + * The object must be locked or unreferenceable. */ -static __inline daddr_t * -swap_pager_diskaddr(object, pindex, valid) +static void +swap_pager_dealloc(object) vm_object_t object; - vm_pindex_t pindex; - int *valid; { - register sw_blk_t swb; - int ix; - - if (valid) - *valid = 0; - ix = pindex / SWB_NPAGES; - if ((ix >= object->un_pager.swp.swp_nblocks) || - (pindex >= object->size)) { - return (FALSE); + /* + * Remove from list right away so lookups will fail if we block for + * pageout completion. + */ + + if (object->handle == NULL) { + TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); + } else { + TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); } - swb = &object->un_pager.swp.swp_blocks[ix]; - ix = pindex % SWB_NPAGES; - if (valid) - *valid = swb->swb_valid & (1 << ix); - return &swb->swb_block[ix]; -} -/* - * Utility routine to set the valid (written) bit for - * a block associated with a pager and offset - */ -static void -swap_pager_setvalid(object, offset, valid) - vm_object_t object; - vm_offset_t offset; - int valid; -{ - register sw_blk_t swb; - int ix; + vm_object_pip_wait(object, "swpdea"); - ix = offset / SWB_NPAGES; - if (ix >= object->un_pager.swp.swp_nblocks) - return; + /* + * Free all remaining metadata. We only bother to free it from + * the swap meta data. We do not attempt to free swapblk's still + * associated with vm_page_t's for this object. We do not care + * if paging is still in progress on some objects. 
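
The sw_alloc_interlock dance in swap_pager_alloc() above is a sleep lock built from a flag plus tsleep()/wakeup(): the value -1 records that a sleeper exists, so the holder knows a wakeup is owed on release. A user-space analogue of just that flag protocol, using pthreads in place of tsleep/wakeup (the kernel version needs no mutex; this is only a model):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int interlock;		/* 0 free, 1 held, -1 held with waiter */

static void
acquire(void)
{
	pthread_mutex_lock(&lk);
	while (interlock != 0) {
		interlock = -1;			/* note that we are waiting */
		pthread_cond_wait(&cv, &lk);	/* the tsleep() equivalent */
	}
	interlock = 1;
	pthread_mutex_unlock(&lk);
}

static void
release(void)
{
	pthread_mutex_lock(&lk);
	if (interlock < 0)
		pthread_cond_broadcast(&cv);	/* the wakeup() equivalent */
	interlock = 0;
	pthread_mutex_unlock(&lk);
}

static void *
worker(void *arg)
{
	acquire();
	printf("thread %ld holds the interlock\n", (long)arg);
	release();
	return (NULL);
}

int
main(void)
{
	pthread_t t[3];
	long i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return (0);
}
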
+ */ - swb = &object->un_pager.swp.swp_blocks[ix]; - ix = offset % SWB_NPAGES; - if (valid) - swb->swb_valid |= (1 << ix); - else - swb->swb_valid &= ~(1 << ix); - return; + swp_pager_meta_free_all(object); } +/************************************************************************ + * SWAP PAGER BITMAP ROUTINES * + ************************************************************************/ + /* - * this routine allocates swap space with a fragmentation - * minimization policy. + * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space + * + * Allocate swap for the requested number of pages. The starting + * swap block number (a page index) is returned or SWAPBLK_NONE + * if the allocation failed. + * + * Also has the side effect of advising that somebody made a mistake + * when they configured swap and didn't configure enough. + * + * Must be called at splvm() to avoid races with bitmap frees from + * vm_page_remove() aka swap_pager_page_removed(). + * + * This routine may not block + * This routine must be called at splvm(). */ -static int -swap_pager_getswapspace(object, amount, rtval) - vm_object_t object; - unsigned int amount; - daddr_t *rtval; + +static __inline daddr_t +swp_pager_getswapspace(npages) + int npages; { - unsigned location; + daddr_t blk; - vm_swap_size -= amount; - - if (!rlist_alloc(&swaplist, amount, &location)) { - vm_swap_size += amount; - return 0; + if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) { + printf("swap_pager_getswapspace: failed\n"); } else { - swapsizecheck(); - object->un_pager.swp.swp_allocsize += amount; - *rtval = location; - return 1; + vm_swap_size -= npages; + swp_sizecheck(); } + return(blk); } /* - * this routine frees swap space with a fragmentation - * minimization policy. + * SWP_PAGER_FREESWAPSPACE() - free raw swap space + * + * This routine returns the specified swap blocks back to the bitmap. + * + * Note: This routine may not block (it could in the old swap code), + * and through the use of the new blist routines it does not block. + * + * We must be called at splvm() to avoid races with bitmap frees from + * vm_page_remove() aka swap_pager_page_removed(). + * + * This routine may not block + * This routine must be called at splvm(). */ -static void -swap_pager_freeswapspace(object, from, to) - vm_object_t object; - unsigned int from; - unsigned int to; + +static __inline void +swp_pager_freeswapspace(blk, npages) + daddr_t blk; + int npages; { - rlist_free(&swaplist, from, to); - vm_swap_size += (to - from) + 1; - object->un_pager.swp.swp_allocsize -= (to - from) + 1; - swapsizecheck(); + blist_free(swapblist, blk, npages); + vm_swap_size += npages; + swp_sizecheck(); } + /* - * this routine frees swap blocks from a specified pager + * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page + * range within an object. + * + * This is a globally accessible routine. + * + * This routine removes swapblk assignments from swap metadata. + * + * The external callers of this routine typically have already destroyed + * or renamed vm_page_t's associated with this range in the object so + * we should be ok. 
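
swp_pager_getswapspace() and swp_pager_freeswapspace() above keep vm_swap_size (now counted in pages) in lockstep with the blist allocator and re-run the hysteresis check on every transition. The book-keeping in isolation, with the radix-bitmap blist replaced by a trivial bump allocator (demo only; a real blist_alloc() hands back arbitrary runs):

#include <stdio.h>

#define SWAPBLK_NONE	(-1)		/* demo sentinel */

static int vm_swap_size = 8;		/* free swap, in pages */
static int next_blk = 100;		/* stands in for the blist */

static int
demo_getswapspace(int npages)
{
	int blk;

	if (vm_swap_size < npages)
		return (SWAPBLK_NONE);	/* the blist_alloc() failure case */
	blk = next_blk;
	next_blk += npages;
	vm_swap_size -= npages;		/* same accounting as the diff */
	return (blk);
}

static void
demo_freeswapspace(int blk, int npages)
{
	(void)blk;			/* real code: blist_free(blk, n) */
	vm_swap_size += npages;
}

int
main(void)
{
	int blk = demo_getswapspace(4);

	printf("allocated blk %d, %d pages free\n", blk, vm_swap_size);
	demo_freeswapspace(blk, 4);
	printf("released, %d pages free\n", vm_swap_size);
	return (0);
}
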
*/ + void swap_pager_freespace(object, start, size) vm_object_t object; vm_pindex_t start; vm_size_t size; { - vm_pindex_t i; - int s; - - s = splvm(); - for (i = start; i < start + size; i += 1) { - int valid; - daddr_t *addr = swap_pager_diskaddr(object, i, &valid); - - if (addr && *addr != SWB_EMPTY) { - swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1); - if (valid) { - swap_pager_setvalid(object, i, 0); - } - *addr = SWB_EMPTY; - } - } - splx(s); + swp_pager_meta_free(object, start, size); } /* - * same as freespace, but don't free, just force a DMZ next time - */ -void -swap_pager_dmzspace(object, start, size) - vm_object_t object; - vm_pindex_t start; - vm_size_t size; -{ - vm_pindex_t i; - int s; - - s = splvm(); - for (i = start; i < start + size; i += 1) { - int valid; - daddr_t *addr = swap_pager_diskaddr(object, i, &valid); - - if (addr && *addr != SWB_EMPTY) { - if (valid) { - swap_pager_setvalid(object, i, 0); - } - } - } - splx(s); -} - -static void -swap_pager_free_swap(object) - vm_object_t object; -{ - register int i, j; - register sw_blk_t swb; - int first_block=0, block_count=0; - int s; - /* - * Free left over swap blocks - */ - swb = object->un_pager.swp.swp_blocks; - if (swb == NULL) { - return; - } - - s = splvm(); - for (i = 0; i < object->un_pager.swp.swp_nblocks; i++, swb++) { - for (j = 0; j < SWB_NPAGES; j++) { - if (swb->swb_block[j] != SWB_EMPTY) { - /* - * initially the length of the run is zero - */ - if (block_count == 0) { - first_block = swb->swb_block[j]; - block_count = btodb(PAGE_SIZE); - swb->swb_block[j] = SWB_EMPTY; - /* - * if the new block can be included into the current run - */ - } else if (swb->swb_block[j] == first_block + block_count) { - block_count += btodb(PAGE_SIZE); - swb->swb_block[j] = SWB_EMPTY; - /* - * terminate the previous run, and start a new one - */ - } else { - swap_pager_freeswapspace(object, first_block, - (unsigned) first_block + block_count - 1); - first_block = swb->swb_block[j]; - block_count = btodb(PAGE_SIZE); - swb->swb_block[j] = SWB_EMPTY; - } - } - } - } - - if (block_count) { - swap_pager_freeswapspace(object, first_block, - (unsigned) first_block + block_count - 1); - } - splx(s); -} - - -/* - * swap_pager_reclaim frees up over-allocated space from all pagers - * this eliminates internal fragmentation due to allocation of space - * for segments that are never swapped to. It has been written so that - * it does not block until the rlist_free operation occurs; it keeps - * the queues consistant. 
- */ - -/* - * Maximum number of blocks (pages) to reclaim per pass - */ -#define MAXRECLAIM 128 - -static void -swap_pager_reclaim() -{ - vm_object_t object; - int i, j, k; - int s; - int reclaimcount; - static struct { - int address; - vm_object_t object; - } reclaims[MAXRECLAIM]; - static int in_reclaim; - - /* - * allow only one process to be in the swap_pager_reclaim subroutine - */ - s = splvm(); - if (in_reclaim) { - tsleep(&in_reclaim, PSWP, "swrclm", 0); - splx(s); - return; - } - in_reclaim = 1; - reclaimcount = 0; - - /* for each pager queue */ - for (k = 0; swp_qs[k]; k++) { - - object = TAILQ_FIRST(swp_qs[k]); - while (object && (reclaimcount < MAXRECLAIM)) { - - /* - * see if any blocks associated with a pager has been - * allocated but not used (written) - */ - if ((object->flags & OBJ_DEAD) == 0 && - (object->paging_in_progress == 0)) { - for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) { - sw_blk_t swb = &object->un_pager.swp.swp_blocks[i]; - - if (swb->swb_locked) - continue; - for (j = 0; j < SWB_NPAGES; j++) { - if (swb->swb_block[j] != SWB_EMPTY && - (swb->swb_valid & (1 << j)) == 0) { - reclaims[reclaimcount].address = swb->swb_block[j]; - reclaims[reclaimcount++].object = object; - swb->swb_block[j] = SWB_EMPTY; - if (reclaimcount >= MAXRECLAIM) - goto rfinished; - } - } - } - } - object = TAILQ_NEXT(object, pager_object_list); - } - } - -rfinished: - - /* - * free the blocks that have been added to the reclaim list - */ - for (i = 0; i < reclaimcount; i++) { - swap_pager_freeswapspace(reclaims[i].object, - reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1); - } - splx(s); - in_reclaim = 0; - wakeup(&in_reclaim); -} - - -/* - * swap_pager_copy copies blocks from one pager to another and - * destroys the source pager + * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager + * and destroy the source. + * + * Copy any valid swapblks from the source to the destination. In + * cases where both the source and destination have a valid swapblk, + * we keep the destination's. + * + * This routine is allowed to block. It may block allocating metadata + * indirectly through swp_pager_meta_build() or if paging is still in + * progress on the source. + * + * XXX vm_page_collapse() kinda expects us not to block because we + * supposedly do not need to allocate memory, but for the moment we + * *may* have to get a little memory from the zone allocator, but + * it is taken from the interrupt memory. We should be ok. + * + * The source object contains no vm_page_t's (which is just as well) + * + * The source object is of type OBJT_SWAP. + * + * The source and destination objects must be + * locked or inaccessible (XXX are they ???) */ void -swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset, - offset, destroysource) +swap_pager_copy(srcobject, dstobject, offset, destroysource) vm_object_t srcobject; - vm_pindex_t srcoffset; vm_object_t dstobject; - vm_pindex_t dstoffset; vm_pindex_t offset; int destroysource; { vm_pindex_t i; - int origsize; - int s; - - if (vm_swap_size) - no_swap_space = 0; - - origsize = srcobject->un_pager.swp.swp_allocsize; /* - * remove the source object from the swap_pager internal queue + * If destroysource is set, we remove the source object from the + * swap_pager internal queue now. 
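
The rewritten copy loop that follows makes one decision per destination page: keep the destination's block when it has one, otherwise pop the source's block across, and silently free any surplus source block. The core of that decision modeled over two small arrays (SWAPBLK_NONE reduced to -1 here; SWM_POP/SWM_FREE become plain moves and clears):

#include <stdio.h>

#define SWAPBLK_NONE	(-1)

static long src[4] = { 500, 501, SWAPBLK_NONE, 503 };
static long dst[4] = { SWAPBLK_NONE, 900, SWAPBLK_NONE, SWAPBLK_NONE };

int
main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		if (dst[i] == SWAPBLK_NONE) {
			if (src[i] != SWAPBLK_NONE) {
				dst[i] = src[i];	/* SWM_POP: move over */
				src[i] = SWAPBLK_NONE;
			}
		} else if (src[i] != SWAPBLK_NONE) {
			src[i] = SWAPBLK_NONE;	/* SWM_FREE: drop surplus */
		}
		printf("page %d: dst=%ld src=%ld\n", i, dst[i], src[i]);
	}
	return (0);
}
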
*/ + if (destroysource) { if (srcobject->handle == NULL) { - TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list); + TAILQ_REMOVE( + &swap_pager_un_object_list, + srcobject, + pager_object_list + ); } else { - TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list); + TAILQ_REMOVE( + NOBJLIST(srcobject->handle), + srcobject, + pager_object_list + ); } } - s = splvm(); - while (srcobject->un_pager.swp.swp_poip) { - tsleep(srcobject, PVM, "spgout", 0); - } - /* - * clean all of the pages that are currently active and finished + * transfer source to destination. */ - if (swap_pager_free_pending) - swap_pager_sync(); - /* - * transfer source to destination - */ - for (i = 0; i < dstobject->size; i += 1) { - int srcvalid, dstvalid; - daddr_t *srcaddrp = swap_pager_diskaddr(srcobject, - i + offset + srcoffset, &srcvalid); - daddr_t *dstaddrp; + for (i = 0; i < dstobject->size; ++i) { + daddr_t dstaddr; /* - * see if the source has space allocated + * Locate (without changing) the swapblk on the destination, + * unless it is invalid in which case free it silently, or + * if the destination is a resident page, in which case the + * source is thrown away. */ - if (srcaddrp && *srcaddrp != SWB_EMPTY) { + + dstaddr = swp_pager_meta_ctl(dstobject, i, 0); + + if (dstaddr == SWAPBLK_NONE) { /* - * if the source is valid and the dest has no space, - * then copy the allocation from the srouce to the - * dest. + * Destination has no swapblk and is not resident, + * copy source. */ - if (srcvalid) { - dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset, - &dstvalid); - /* - * if the dest already has a valid block, - * deallocate the source block without - * copying. - */ - if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { - swap_pager_freeswapspace(dstobject, *dstaddrp, - *dstaddrp + btodb(PAGE_SIZE) - 1); - *dstaddrp = SWB_EMPTY; - } - if (dstaddrp && *dstaddrp == SWB_EMPTY) { - *dstaddrp = *srcaddrp; - *srcaddrp = SWB_EMPTY; - dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE); - srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE); - swap_pager_setvalid(dstobject, i + dstoffset, 1); - } - } + daddr_t srcaddr; + + srcaddr = swp_pager_meta_ctl( + srcobject, + i + offset, + SWM_POP + ); + + if (srcaddr != SWAPBLK_NONE) + swp_pager_meta_build(dstobject, i, srcaddr, 1); + } else { /* - * if the source is not empty at this point, then - * deallocate the space. + * Destination has valid swapblk or it is represented + * by a resident page. We destroy the source block. */ - if (*srcaddrp != SWB_EMPTY) { - swap_pager_freeswapspace(srcobject, *srcaddrp, - *srcaddrp + btodb(PAGE_SIZE) - 1); - *srcaddrp = SWB_EMPTY; - } + + swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } - splx(s); /* - * Free left over swap blocks + * Free left over swap blocks in source. + * + * We have to revert the type to OBJT_DEFAULT so we do not accidentally + * double-remove the object from the swap queues. 
*/ - if (destroysource) { - swap_pager_free_swap(srcobject); - if (srcobject->un_pager.swp.swp_allocsize) { - printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n", - srcobject->un_pager.swp.swp_allocsize, origsize); - } - - free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA); - srcobject->un_pager.swp.swp_blocks = NULL; + if (destroysource) { + swp_pager_meta_free_all(srcobject); + /* + * Reverting the type is not necessary, the caller is going + * to destroy srcobject directly, but I'm doing it here + * for consistancy since we've removed the object from its + * queues. + */ + srcobject->type = OBJT_DEFAULT; } return; } -static void -swap_pager_dealloc(object) +/* + * SWAP_PAGER_HASPAGE() - determine if we have good backing store for + * the requested page. + * + * We determine whether good backing store exists for the requested + * page and return TRUE if it does, FALSE if it doesn't. + * + * If TRUE, we also try to determine how much valid, contiguous backing + * store exists before and after the requested page within a reasonable + * distance. We do not try to restrict it to the swap device stripe + * (that is handled in getpages/putpages). It probably isn't worth + * doing here. + */ + +boolean_t +swap_pager_haspage(object, pindex, before, after) vm_object_t object; + vm_pindex_t pindex; + int *before; + int *after; { - int s; - sw_blk_t swb; + daddr_t blk0; /* - * Remove from list right away so lookups will fail if we block for - * pageout completion. + * do we have good backing store at the requested index ? */ - if (object->handle == NULL) { - TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); - } else { - TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list); - } - /* - * Wait for all pageouts to finish and remove all entries from - * cleaning list. - */ + blk0 = swp_pager_meta_ctl(object, pindex, 0); - s = splvm(); - while (object->un_pager.swp.swp_poip) { - tsleep(object, PVM, "swpout", 0); + if (blk0 & SWAPBLK_NONE) { + if (before) + *before = 0; + if (after) + *after = 0; + return (FALSE); } - splx(s); - - if (swap_pager_free_pending) - swap_pager_sync(); /* - * Free left over swap blocks + * find backwards-looking contiguous good backing store */ - swap_pager_free_swap(object); - if (object->un_pager.swp.swp_allocsize) { - printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n", - object->un_pager.swp.swp_allocsize); - } - swb = object->un_pager.swp.swp_blocks; - if (swb) { - /* - * Free swap management resources - */ - free(swb, M_VMPGDATA); - object->un_pager.swp.swp_blocks = NULL; - } -} + if (before != NULL) { + int i; -static __inline int -swap_pager_block_index(pindex) - vm_pindex_t pindex; -{ - return (pindex / SWB_NPAGES); -} - -static __inline int -swap_pager_block_offset(pindex) - vm_pindex_t pindex; -{ - return (pindex % SWB_NPAGES); -} + for (i = 1; i < (SWB_NPAGES/2); ++i) { + daddr_t blk; -/* - * swap_pager_haspage returns TRUE if the pager has data that has - * been written out. 
- */ -static boolean_t -swap_pager_haspage(object, pindex, before, after) - vm_object_t object; - vm_pindex_t pindex; - int *before; - int *after; -{ - register sw_blk_t swb; - int ix; - - if (before != NULL) - *before = 0; - if (after != NULL) - *after = 0; - ix = pindex / SWB_NPAGES; - if (ix >= object->un_pager.swp.swp_nblocks) { - return (FALSE); + if (i > pindex) + break; + blk = swp_pager_meta_ctl(object, pindex - i, 0); + if (blk & SWAPBLK_NONE) + break; + if (blk != blk0 - i) + break; + } + *before = (i - 1); } - swb = &object->un_pager.swp.swp_blocks[ix]; - ix = pindex % SWB_NPAGES; - - if (swb->swb_block[ix] != SWB_EMPTY) { - - if (swb->swb_valid & (1 << ix)) { - int tix; - if (before) { - for(tix = ix - 1; tix >= 0; --tix) { - if ((swb->swb_valid & (1 << tix)) == 0) - break; - if ((swb->swb_block[tix] + - (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) != - swb->swb_block[ix]) - break; - (*before)++; - } - } - if (after) { - for(tix = ix + 1; tix < SWB_NPAGES; tix++) { - if ((swb->swb_valid & (1 << tix)) == 0) - break; - if ((swb->swb_block[tix] - - (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) != - swb->swb_block[ix]) - break; - (*after)++; - } - } + /* + * find forward-looking contiguous good backing store + */ - return TRUE; + if (after != NULL) { + int i; + + for (i = 1; i < (SWB_NPAGES/2); ++i) { + daddr_t blk; + + blk = swp_pager_meta_ctl(object, pindex + i, 0); + if (blk & SWAPBLK_NONE) + break; + if (blk != blk0 + i) + break; } + *after = (i - 1); } - return (FALSE); -} -/* - * Wakeup based upon spc state - */ -static void -spc_wakeup(void) -{ - if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) { - swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT; - wakeup(&swap_pager_needflags); - } else if ((swap_pager_needflags & SWAP_FREE_NEEDED) && - swap_pager_free_count >= ((2 * npendingio) / 3)) { - swap_pager_needflags &= ~SWAP_FREE_NEEDED; - wakeup(&swap_pager_free); - } + return (TRUE); } /* - * Free an spc structure + * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page + * + * This removes any associated swap backing store, whether valid or + * not, from the page. + * + * This routine is typically called when a page is made dirty, at + * which point any associated swap can be freed. MADV_FREE also + * calls us in a special-case situation + * + * NOTE!!! If the page is clean and the swap was valid, the caller + * should make the page dirty before calling this routine. This routine + * does NOT change the m->dirty status of the page. Also: MADV_FREE + * depends on it. + * + * This routine may not block */ -static void -spc_free(spc) - swp_clean_t spc; -{ - spc->spc_flags = 0; - TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); - swap_pager_free_count++; - if (swap_pager_needflags) { - spc_wakeup(); - } -} -/* - * swap_pager_ridpages is a convienience routine that deallocates all - * but the required page. this is usually used in error returns that - * need to invalidate the "extra" readahead pages. - */ static void -swap_pager_ridpages(m, count, reqpage) - vm_page_t *m; - int count; - int reqpage; +swap_pager_unswapped(m) + vm_page_t m; { - int i; - - for (i = 0; i < count; i++) { - if (i != reqpage) { - vm_page_free(m[i]); - } - } + swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* - * swap_pager_iodone1 is the completion routine for both reads and async writes + * SWAP_PAGER_GETPAGES() - bring pages in from swap + * + * Attempt to retrieve (m, count) pages from backing store, but make + * sure we retrieve at least m[reqpage]. 
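
The before/after scans in the new swap_pager_haspage() above simply count how far the swap block numbers stay exactly contiguous with blk0 on either side of the requested page. A stand-alone rendering over synthetic metadata (meta_ctl_demo plays the role of swp_pager_meta_ctl()):

#include <stdio.h>

#define SWB_NPAGES	16
#define SWAPBLK_NONE	(-1L)

/* Synthetic swap blocks for pindex 0..9; -1 means no backing store. */
static long blks[10] = { 100, 101, 102, -1, 200, 201, 202, 203, 204, 300 };

static long
meta_ctl_demo(int pindex)
{
	return ((pindex >= 0 && pindex < 10) ? blks[pindex] : SWAPBLK_NONE);
}

int
main(void)
{
	int pindex = 6;
	long blk0 = meta_ctl_demo(pindex);
	int i, before, after;

	for (i = 1; i < SWB_NPAGES / 2; ++i)	/* backwards scan */
		if (i > pindex || meta_ctl_demo(pindex - i) != blk0 - i)
			break;
	before = i - 1;
	for (i = 1; i < SWB_NPAGES / 2; ++i)	/* forwards scan */
		if (meta_ctl_demo(pindex + i) != blk0 + i)
			break;
	after = i - 1;
	printf("pindex %d: blk %ld, before=%d after=%d\n",
	    pindex, blk0, before, after);	/* blk 202, 2 and 2 */
	return (0);
}
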
We try to load in as large + * a chunk surrounding m[reqpage] as is contiguous in swap and which + * belongs to the same object. + * + * The code is designed for asynchronous operation and + * immediate-notification of 'reqpage' but tends not to be + * used that way. Please do not optimize-out this algorithmic + * feature, I intend to improve on it in the future. + * + * The parent has a single vm_object_pip_add() reference prior to + * calling us and we should return with the same. + * + * The parent has BUSY'd the pages. We should return with 'm' + * left busy, but the others adjusted. */ -static void -swap_pager_iodone1(bp) - struct buf *bp; -{ - bp->b_flags |= B_DONE; - bp->b_flags &= ~B_ASYNC; - wakeup(bp); -} static int swap_pager_getpages(object, m, count, reqpage) @@ -926,208 +745,235 @@ swap_pager_getpages(object, m, count, reqpage) vm_page_t *m; int count, reqpage; { - register struct buf *bp; - sw_blk_t swb[count]; - register int s; + struct buf *bp; + vm_page_t mreq; + int s; int i; - boolean_t rv; - vm_offset_t kva, off[count]; - vm_pindex_t paging_offset; - int reqaddr[count]; - int sequential; - - int first, last; - int failed; - int reqdskregion; - - object = m[reqpage]->object; - paging_offset = OFF_TO_IDX(object->paging_offset); - sequential = (m[reqpage]->pindex == (object->last_read + 1)); - - for (i = 0; i < count; i++) { - vm_pindex_t fidx = m[i]->pindex + paging_offset; - int ix = swap_pager_block_index(fidx); - - if (ix >= object->un_pager.swp.swp_nblocks) { - int j; - - if (i <= reqpage) { - swap_pager_ridpages(m, count, reqpage); - return (VM_PAGER_FAIL); - } - for (j = i; j < count; j++) { - vm_page_free(m[j]); - } - count = i; + int j; + daddr_t blk; + vm_offset_t kva; + vm_pindex_t lastpindex; + + mreq = m[reqpage]; + +#if !defined(MAX_PERF) + if (mreq->object != object) { + panic("swap_pager_getpages: object mismatch %p/%p", + object, + mreq->object + ); + } +#endif + /* + * Calculate range to retrieve. The pages have already been assigned + * their swapblks. We require a *contiguous* range that falls entirely + * within a single device stripe. If we do not supply it, bad things + * happen. + */ + + + blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); + + for (i = reqpage - 1; i >= 0; --i) { + daddr_t iblk; + + iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); + if (iblk & SWAPBLK_NONE) + break; + + if ((blk ^ iblk) & dmmax_mask) + break; + + if (blk != iblk + (reqpage - i)) break; - } - swb[i] = &object->un_pager.swp.swp_blocks[ix]; - off[i] = swap_pager_block_offset(fidx); - reqaddr[i] = swb[i]->swb_block[off[i]]; } + ++i; - /* make sure that our required input request is existant */ + for (j = reqpage + 1; j < count; ++j) { + daddr_t jblk; - if (reqaddr[reqpage] == SWB_EMPTY || - (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { - swap_pager_ridpages(m, count, reqpage); - return (VM_PAGER_FAIL); + jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); + if (jblk & SWAPBLK_NONE) + break; + + if ((blk ^ jblk) & dmmax_mask) + break; + + if (blk != jblk - (j - reqpage)) + break; } - reqdskregion = reqaddr[reqpage] / dmmax; /* - * search backwards for the first contiguous page to transfer + * If blk itself is bad, well, we can't do any I/O. This should + * already be covered as a side effect, but I'm making sure. 
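
The (blk ^ iblk) & dmmax_mask tests above are how the cluster is kept inside one swap-device stripe: two page-sized block numbers lie in the same dmmax-page stripe exactly when they agree in every bit above the stripe size. For instance, with the dmmax of 32 set up in swap_pager_init():

#include <stdio.h>

#define SWB_NPAGES	16

int
main(void)
{
	int dmmax = SWB_NPAGES * 2;	/* 32-page stripe, as in the diff */
	int dmmax_mask = ~(dmmax - 1);
	int blk = 30, iblk;

	for (iblk = 28; iblk <= 34; iblk++)
		printf("blk %d vs %d: %s stripe\n", blk, iblk,
		    ((blk ^ iblk) & dmmax_mask) ? "different" : "same");
	return (0);	/* 28..31 same, 32..34 different */
}
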
*/ - failed = 0; - first = 0; - for (i = reqpage - 1; i >= 0; --i) { - if (sequential || failed || (reqaddr[i] == SWB_EMPTY) || - (swb[i]->swb_valid & (1 << off[i])) == 0 || - (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || - ((reqaddr[i] / dmmax) != reqdskregion)) { - failed = 1; - vm_page_free(m[i]); - if (first == 0) - first = i + 1; - } + + if (blk & SWAPBLK_NONE) { + i = reqpage; + j = reqpage + 1; } + /* - * search forwards for the last contiguous page to transfer + * free pages outside our collection range. Note: we never free + * mreq, it must remain busy throughout. */ - failed = 0; - last = count; - for (i = reqpage + 1; i < count; i++) { - if (failed || (reqaddr[i] == SWB_EMPTY) || - (swb[i]->swb_valid & (1 << off[i])) == 0 || - (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || - ((reqaddr[i] / dmmax) != reqdskregion)) { - failed = 1; - vm_page_free(m[i]); - if (last == count) - last = i; - } - } - count = last; - if (first != 0) { - for (i = first; i < count; i++) { - m[i - first] = m[i]; - reqaddr[i - first] = reqaddr[i]; - off[i - first] = off[i]; + { + int k; + + for (k = 0; k < i; ++k) { + vm_page_free(m[k]); + } + for (k = j; k < count; ++k) { + vm_page_free(m[k]); } - count -= first; - reqpage -= first; } - ++swb[reqpage]->swb_locked; /* - * at this point: "m" is a pointer to the array of vm_page_t for - * paging I/O "count" is the number of vm_page_t entries represented - * by "m" "object" is the vm_object_t for I/O "reqpage" is the index - * into "m" for the page actually faulted + * Return VM_PAGER_FAIL if we have nothing + * to do. Return mreq still busy, but the + * others unbusied. */ + if (blk & SWAPBLK_NONE) + return(VM_PAGER_FAIL); + + /* * Get a swap buffer header to perform the IO */ - bp = getpbuf(); + + bp = getpbuf(&nsw_rcount); kva = (vm_offset_t) bp->b_data; /* * map our page(s) into kva for input + * + * NOTE: B_PAGING is set by pbgetvp() */ - pmap_qenter(kva, m, count); - bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING; - bp->b_iodone = swap_pager_iodone1; + pmap_qenter(kva, m + i, j - i); + + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = swp_pager_async_iodone; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; crhold(bp->b_rcred); crhold(bp->b_wcred); bp->b_data = (caddr_t) kva; - bp->b_blkno = reqaddr[0]; - bp->b_bcount = PAGE_SIZE * count; - bp->b_bufsize = PAGE_SIZE * count; + /* + * b_blkno is in page-sized chunks. swapblk is valid, too, so + * we don't have to mask it against SWAPBLK_MASK. + */ + bp->b_blkno = blk - (reqpage - i); + bp->b_bcount = PAGE_SIZE * (j - i); + bp->b_bufsize = PAGE_SIZE * (j - i); + bp->b_pager.pg_reqpage = reqpage - i; + + { + int k; + + for (k = i; k < j; ++k) { + bp->b_pages[k - i] = m[k]; + vm_page_flag_set(m[k], PG_SWAPINPROG); + } + } + bp->b_npages = j - i; pbgetvp(swapdev_vp, bp); cnt.v_swapin++; - cnt.v_swappgsin += count; + cnt.v_swappgsin += bp->b_npages; + + /* + * We still hold the lock on mreq, and our automatic completion routine + * does not remove it. + */ + + vm_object_pip_add(mreq->object, bp->b_npages); + lastpindex = m[j-1]->pindex; + /* - * perform the I/O + * perform the I/O. NOTE!!! bp cannot be considered valid after + * this point because we automatically release it on completion. + * Instead, we look at the one page we are interested in which we + * still hold a lock on even through the I/O completion. 
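+	 *
+	 * The handshake with the completion routine, roughly sketched
+	 * (both sides are in this file):
+	 *
+	 *	this routine			swp_pager_async_iodone()
+	 *	------------			------------------------
+	 *	set PG_SWAPINPROG		clear PG_SWAPINPROG
+	 *	set PG_WANTED, tsleep(mreq)	vm_page_flash(mreq) -> wakeup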
+ * + * The other pages in our m[] array are also released on completion, + * so we cannot assume they are valid anymore either. + * + * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY */ + VOP_STRATEGY(bp->b_vp, bp); /* - * wait for the sync I/O to complete + * wait for the page we want to complete. PG_SWAPINPROG is always + * cleared on completion. If an I/O error occurs, SWAPBLK_NONE + * is set in the meta-data. */ + s = splvm(); - while ((bp->b_flags & B_DONE) == 0) { - if (tsleep(bp, PVM, "swread", hz*20)) { + + while ((mreq->flags & PG_SWAPINPROG) != 0) { + vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED); + cnt.v_intrans++; + if (tsleep(mreq, PSWP, "swread", hz*20)) { printf( -"swap_pager: indefinite wait buffer: device: %#lx, blkno: %ld, size: %ld\n", + "swap_pager: indefinite wait buffer: device:" + " %#lx, blkno: %ld, size: %ld\n", (u_long)bp->b_dev, (long)bp->b_blkno, - (long)bp->b_bcount); + (long)bp->b_bcount + ); } } - if (bp->b_flags & B_ERROR) { - printf( -"swap_pager: I/O error - pagein failed; blkno %ld, size %ld, error %d\n", - (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error); - rv = VM_PAGER_ERROR; - } else { - rv = VM_PAGER_OK; - } - splx(s); - swb[reqpage]->swb_locked--; - - /* - * remove the mapping for kernel virtual - */ - pmap_qremove(kva, count); /* - * release the physical I/O buffer - */ - relpbuf(bp); - /* - * finish up input if everything is ok + * mreq is left bussied after completion, but all the other pages + * are freed. If we had an unrecoverable read error the page will + * not be valid. */ - if (rv == VM_PAGER_OK) { - for (i = 0; i < count; i++) { - m[i]->dirty = 0; - vm_page_flag_clear(m[i], PG_ZERO); - if (i != reqpage) { - /* - * whether or not to leave the page - * activated is up in the air, but we - * should put the page on a page queue - * somewhere. (it already is in the - * object). After some emperical - * results, it is best to deactivate - * the readahead pages. - */ - vm_page_deactivate(m[i]); - /* - * just in case someone was asking for - * this page we now tell them that it - * is ok to use - */ - m[i]->valid = VM_PAGE_BITS_ALL; - vm_page_wakeup(m[i]); - } - } - - m[reqpage]->object->last_read = m[count-1]->pindex; + if (mreq->valid != VM_PAGE_BITS_ALL) { + return(VM_PAGER_ERROR); } else { - swap_pager_ridpages(m, count, reqpage); + mreq->object->last_read = lastpindex; + return(VM_PAGER_OK); } - return (rv); + + /* + * A final note: in a low swap situation, we cannot deallocate swap + * and mark a page dirty here because the caller is likely to mark + * the page clean when we return, causing the page to possibly revert + * to all-zero's later. + */ } +/* + * swap_pager_putpages: + * + * Assign swap (if necessary) and initiate I/O on the specified pages. + * + * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects + * are automatically converted to SWAP objects. + * + * In a low memory situation we may block in VOP_STRATEGY(), but the new + * vm_page reservation system coupled with properly written VFS devices + * should ensure that no low-memory deadlock occurs. This is an area + * which needs work. + * + * The parent has N vm_object_pip_add() references prior to + * calling us and will remove references for rtvals[] that are + * not set to VM_PAGER_PEND. We need to remove the rest on I/O + * completion. + * + * The parent has soft-busy'd the pages it passes us and will unbusy + * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. + * We need to unbusy the rest on I/O completion. 
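+ *
+ *	A sketch of the caller-side convention this implies
+ *	(illustrative only, not a quote of the real pageout code):
+ *
+ *		vm_object_pip_add(object, count);
+ *		swap_pager_putpages(object, m, count, FALSE, rtvals);
+ *		for (i = 0; i < count; ++i) {
+ *			if (rtvals[i] != VM_PAGER_PEND) {
+ *				vm_object_pip_wakeup(object);
+ *				vm_page_io_finish(m[i]);
+ *			}
+ *		}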
+ */ + int swap_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; @@ -1136,534 +982,749 @@ swap_pager_putpages(object, m, count, sync, rtvals) boolean_t sync; int *rtvals; { - register struct buf *bp; - sw_blk_t swb[count]; - register int s; - int i, j, ix, firstidx, lastidx; - boolean_t rv; - vm_offset_t kva, off, fidx; - swp_clean_t spc; - vm_pindex_t paging_pindex; - int reqaddr[count]; - int failed; - - if (vm_swap_size) - no_swap_space = 0; - - if (no_swap_space) { - for (i = 0; i < count; i++) - rtvals[i] = VM_PAGER_FAIL; - return VM_PAGER_FAIL; + int i; + int n = 0; + int grv = VM_PAGER_OK; + +#if !defined(MAX_PERF) + if (count && m[0]->object != object) { + panic("swap_pager_getpages: object mismatch %p/%p", + object, + m[0]->object + ); + } +#endif + /* + * Step 1 + * + * Turn object into OBJT_SWAP + * check for bogus sysops + * force sync if not pageout process + */ + + if (object->type != OBJT_SWAP) { + swp_pager_meta_build(object, 0, SWAPBLK_NONE, 0); } if (curproc != pageproc) sync = TRUE; - object = m[0]->object; - paging_pindex = OFF_TO_IDX(object->paging_offset); - - failed = 0; - for (j = 0; j < count; j++) { - fidx = m[j]->pindex + paging_pindex; - ix = swap_pager_block_index(fidx); - swb[j] = 0; - if (ix >= object->un_pager.swp.swp_nblocks) { - rtvals[j] = VM_PAGER_FAIL; - failed = 1; - continue; - } else { - rtvals[j] = VM_PAGER_OK; - } - swb[j] = &object->un_pager.swp.swp_blocks[ix]; - swb[j]->swb_locked++; - if (failed) { - rtvals[j] = VM_PAGER_FAIL; - continue; - } - off = swap_pager_block_offset(fidx); - reqaddr[j] = swb[j]->swb_block[off]; - if (reqaddr[j] == SWB_EMPTY) { - daddr_t blk; - int tries; - int ntoget; + /* + * Step 2 + * + * Assign swap blocks and issue I/O. We reallocate swap on the fly. + * The page is left dirty until the pageout operation completes + * successfully. + */ - tries = 0; - s = splvm(); + for (i = 0; i < count; i += n) { + int s; + int j; + struct buf *bp; + daddr_t blk; - /* - * if any other pages have been allocated in this - * block, we only try to get one page. - */ - for (i = 0; i < SWB_NPAGES; i++) { - if (swb[j]->swb_block[i] != SWB_EMPTY) - break; - } + /* + * Maximum I/O size is limited by a number of factors. + */ - ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1; - /* - * this code is alittle conservative, but works (the - * intent of this code is to allocate small chunks for - * small objects) - */ - if ((off == 0) && ((fidx + ntoget) > object->size)) { - ntoget = object->size - fidx; - } - retrygetspace: - if (!swap_pager_full && ntoget > 1 && - swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE), - &blk)) { - - for (i = 0; i < ntoget; i++) { - swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; - swb[j]->swb_valid = 0; - } + n = min(BLIST_MAX_ALLOC, count - i); + n = min(n, max_pageout_cluster); - reqaddr[j] = swb[j]->swb_block[off]; - } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE), - &swb[j]->swb_block[off])) { - /* - * if the allocation has failed, we try to - * reclaim space and retry. - */ - if (++tries == 1) { - swap_pager_reclaim(); - goto retrygetspace; - } - rtvals[j] = VM_PAGER_AGAIN; - failed = 1; - swap_pager_full = 1; - } else { - reqaddr[j] = swb[j]->swb_block[off]; - swb[j]->swb_valid &= ~(1 << off); + /* + * Get biggest block of swap we can. If we fail, fall + * back and try to allocate a smaller block. Don't go + * overboard trying to allocate space if it would overly + * fragment swap. 
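+		 *
+		 * E.g. a request for n = 16 tries 16, then 8, then 4
+		 * pages and gives up below that; shattering the request
+		 * into single-page allocations would trade I/O
+		 * clustering for swap fragmentation.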
+ */ + while ( + (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && + n > 4 + ) { + n >>= 1; + } + if (blk == SWAPBLK_NONE) { + for (j = 0; j < n; ++j) { + rtvals[i+j] = VM_PAGER_FAIL; } - splx(s); + grv = VM_PAGER_FAIL; + continue; } - } - /* - * search forwards for the last contiguous page to transfer - */ - failed = 0; - for (i = 0; i < count; i++) { - if (failed || - (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) || - ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) || - (rtvals[i] != VM_PAGER_OK)) { - failed = 1; - if (rtvals[i] == VM_PAGER_OK) - rtvals[i] = VM_PAGER_AGAIN; + /* + * Oops, too big if it crosses a stripe + * + * 1111000000 + * 111111 + * 1000001 + */ + if ((blk ^ (blk + n)) & dmmax_mask) { + j = ((blk + dmmax) & dmmax_mask) - blk; + swp_pager_freeswapspace(blk + j, n - j); + n = j; } - } - ix = 0; - firstidx = -1; - for (i = 0; i < count; i++) { - if (rtvals[i] == VM_PAGER_OK) { - ix++; - if (firstidx == -1) { - firstidx = i; - } - } else if (firstidx >= 0) { - break; - } - } + /* + * All I/O parameters have been satisfied, build the I/O + * request and assign the swap space. + * + * NOTE: B_PAGING is set by pbgetvp() + */ - if (firstidx == -1) { - for (i = 0; i < count; i++) { - if (rtvals[i] == VM_PAGER_OK) - rtvals[i] = VM_PAGER_AGAIN; - } - return VM_PAGER_AGAIN; - } + bp = getpbuf(&nsw_wcount); + bp->b_spc = NULL; /* not used, but NULL-out anyway */ - lastidx = firstidx + ix; + pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); - if (ix > max_pageout_cluster) { - for (i = firstidx + max_pageout_cluster; i < lastidx; i++) { - if (rtvals[i] == VM_PAGER_OK) - rtvals[i] = VM_PAGER_AGAIN; - } - ix = max_pageout_cluster; - lastidx = firstidx + ix; - } + bp->b_flags = B_BUSY | B_ASYNC; + bp->b_proc = &proc0; /* XXX (but without B_PHYS this is ok) */ + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - for (i = 0; i < firstidx; i++) { - if (swb[i]) - swb[i]->swb_locked--; - } + if (bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if (bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + pbgetvp(swapdev_vp, bp); - for (i = lastidx; i < count; i++) { - if (swb[i]) - swb[i]->swb_locked--; - } + bp->b_bcount = PAGE_SIZE * n; + bp->b_bufsize = PAGE_SIZE * n; + bp->b_blkno = blk; -#ifdef INVARIANTS - for (i = firstidx; i < lastidx; i++) { - if (reqaddr[i] == SWB_EMPTY) { - printf("I/O to empty block???? -- pindex: %d, i: %d\n", - m[i]->pindex, i); - } - } -#endif + s = splvm(); - /* - * Clean up all completed async pageouts. - */ - if (swap_pager_free_pending) - swap_pager_sync(); + for (j = 0; j < n; ++j) { + vm_page_t mreq = m[i+j]; - /* - * get a swap pager clean data structure, block until we get it - */ - if (curproc == pageproc) { - if (swap_pager_free_count == 0) { - s = splvm(); - while (swap_pager_free_count == 0) { - swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT; - /* - * if it does not get one within a short time, then - * there is a potential deadlock, so we go-on trying - * to free pages. It is important to block here as opposed - * to returning, thereby allowing the pageout daemon to continue. - * It is likely that pageout daemon will start suboptimally - * reclaiming vnode backed pages if we don't block. Since the - * I/O subsystem is probably already fully utilized, might as - * well wait. 
- */ - if (tsleep(&swap_pager_needflags, PVM-1, "swpfre", hz/2)) { - if (swap_pager_free_pending) - swap_pager_sync(); - if (swap_pager_free_count == 0) { - for (i = firstidx; i < lastidx; i++) { - rtvals[i] = VM_PAGER_AGAIN; - } - splx(s); - return VM_PAGER_AGAIN; - } - } else { - swap_pager_sync(); - } - } - splx(s); + swp_pager_meta_build( + mreq->object, + mreq->pindex, + blk + j, + 0 + ); + mreq->dirty = VM_PAGE_BITS_ALL; + rtvals[i+j] = VM_PAGER_OK; + + vm_page_flag_set(mreq, PG_SWAPINPROG); + bp->b_pages[j] = mreq; } + bp->b_flags |= B_CALL; + bp->b_npages = n; - spc = TAILQ_FIRST(&swap_pager_free); - KASSERT(spc != NULL, - ("swap_pager_putpages: free queue is empty, %d expected\n", - swap_pager_free_count)); - TAILQ_REMOVE(&swap_pager_free, spc, spc_list); - swap_pager_free_count--; - - kva = spc->spc_kva; - bp = spc->spc_bp; - bzero(bp, sizeof *bp); - bp->b_spc = spc; - bp->b_xflags = 0; - bp->b_data = (caddr_t) kva; - } else { - spc = NULL; - bp = getpbuf(); - kva = (vm_offset_t) bp->b_data; - bp->b_spc = NULL; - } + cnt.v_swapout++; + cnt.v_swappgsout += bp->b_npages; + swapdev_vp->v_numoutput++; - /* - * map our page(s) into kva for I/O - */ - pmap_qenter(kva, &m[firstidx], ix); + /* + * asynchronous + * + * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY + */ + + if (sync == FALSE) { + bp->b_iodone = swp_pager_async_iodone; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bcount; + VOP_STRATEGY(bp->b_vp, bp); + + for (j = 0; j < n; ++j) + rtvals[i+j] = VM_PAGER_PEND; + + splx(s); + grv = VM_PAGER_PEND; + continue; + } - /* - * get the base I/O offset into the swap file - */ - for (i = firstidx; i < lastidx ; i++) { - fidx = m[i]->pindex + paging_pindex; - off = swap_pager_block_offset(fidx); /* - * set the valid bit + * synchronous + * + * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY */ - swb[i]->swb_valid |= (1 << off); + + bp->b_iodone = swp_pager_sync_iodone; + VOP_STRATEGY(bp->b_vp, bp); + /* - * and unlock the data structure + * Wait for the sync I/O to complete, then update rtvals. + * We just set the rtvals[] to VM_PAGER_PEND so we can call + * our async completion routine at the end, thus avoiding a + * double-free. */ - swb[i]->swb_locked--; - } + while ((bp->b_flags & B_DONE) == 0) { + tsleep(bp, PVM, "swwrt", 0); + } - bp->b_flags = B_BUSY | B_PAGING; - bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ - bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - if (bp->b_rcred != NOCRED) - crhold(bp->b_rcred); - if (bp->b_wcred != NOCRED) - crhold(bp->b_wcred); - bp->b_blkno = reqaddr[firstidx]; - pbgetvp(swapdev_vp, bp); + if (bp->b_flags & B_ERROR) { + grv = VM_PAGER_ERROR; + } - bp->b_bcount = PAGE_SIZE * ix; - bp->b_bufsize = PAGE_SIZE * ix; + for (j = 0; j < n; ++j) + rtvals[i+j] = VM_PAGER_PEND; - s = splvm(); - swapdev_vp->v_numoutput++; + if (bp->b_flags & B_ERROR) { + grv = VM_PAGER_ERROR; + } - /* - * If this is an async write we set up additional buffer fields and - * place a "cleaning" entry on the inuse queue. 
- */ - object->un_pager.swp.swp_poip++; - - if (spc) { - spc->spc_flags = 0; - spc->spc_object = object; - bp->b_npages = ix; - for (i = firstidx; i < lastidx; i++) { - spc->spc_m[i] = m[i]; - bp->b_pages[i - firstidx] = m[i]; - vm_page_protect(m[i], VM_PROT_READ); - pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); - m[i]->dirty = 0; - } - spc->spc_first = firstidx; - spc->spc_count = ix; /* - * the completion routine for async writes + * Now that we are through with the bp, we can call the + * normal async completion, which frees everything up. */ - bp->b_flags |= B_CALL; - bp->b_iodone = swap_pager_iodone; - bp->b_dirtyoff = 0; - bp->b_dirtyend = bp->b_bcount; - TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); - } else { - bp->b_flags |= B_CALL; - bp->b_iodone = swap_pager_iodone1; - bp->b_npages = ix; - for (i = firstidx; i < lastidx; i++) - bp->b_pages[i - firstidx] = m[i]; - } - cnt.v_swapout++; - cnt.v_swappgsout += ix; + swp_pager_async_iodone(bp); - /* - * perform the I/O - */ - VOP_STRATEGY(bp->b_vp, bp); - if (sync == FALSE) { - if (swap_pager_free_pending) { - swap_pager_sync(); - } - for (i = firstidx; i < lastidx; i++) { - rtvals[i] = VM_PAGER_PEND; - } splx(s); - return VM_PAGER_PEND; } + return(grv); +} + +/* + * swap_pager_sync_iodone: + * + * Completion routine for synchronous reads and writes from/to swap. + * We just mark the bp is complete and wake up anyone waiting on it. + * + * This routine may not block. + */ + +static void +swp_pager_sync_iodone(bp) + struct buf *bp; +{ + bp->b_flags |= B_DONE; + bp->b_flags &= ~B_ASYNC; + wakeup(bp); +} + +/* + * swp_pager_async_iodone: + * + * Completion routine for asynchronous reads and writes from/to swap. + * Also called manually by synchronous code to finish up a bp. + * + * WARNING! This routine may be called from an interrupt. We cannot + * mess with swap metadata unless we want to run all our other routines + * at splbio() too, which I'd rather not do. We up ourselves + * to splvm() because we may call vm_page_free(), which can unlink a + * page from an object. + * + * XXX currently I do not believe any object routines protect + * object->memq at splvm(). The code must be gone over to determine + * the actual state of the problem. + * + * For READ operations, the pages are PG_BUSY'd. For WRITE operations, + * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY + * unbusy all pages except the 'main' request page. For WRITE + * operations, we vm_page_t->busy'd unbusy all pages ( we can do this + * because we marked them all VM_PAGER_PEND on return from putpages ). + * + * This routine may not block. + * This routine is called at splbio() + */ + +static void +swp_pager_async_iodone(bp) + register struct buf *bp; +{ + int s; + int i; + vm_object_t object = NULL; + + s = splvm(); + + bp->b_flags |= B_DONE; + /* - * wait for the sync I/O to complete + * report error */ - while ((bp->b_flags & B_DONE) == 0) { - tsleep(bp, PVM, "swwrt", 0); - } if (bp->b_flags & B_ERROR) { printf( -"swap_pager: I/O error - pageout failed; blkno %ld, size %ld, error %d\n", - (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error); - rv = VM_PAGER_ERROR; - } else { - rv = VM_PAGER_OK; + "swap_pager: I/O error - %s failed; blkno %ld," + "size %ld, error %d\n", + ((bp->b_flags & B_READ) ? "pagein" : "pageout"), + (long)bp->b_blkno, + (long)bp->b_bcount, + bp->b_error + ); } - object->un_pager.swp.swp_poip--; - if (object->un_pager.swp.swp_poip == 0) - wakeup(object); - - if (bp->b_vp) - pbrelvp(bp); + /* + * set object. 
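+	 *
+	 * All pages in a single pbuf belong to one object (both the
+	 * getpages and putpages paths build each buf that way), so
+	 * b_pages[0] suffices to recover it for the pip accounting
+	 * below.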
+ */ - splx(s); + if (bp->b_npages) + object = bp->b_pages[0]->object; /* * remove the mapping for kernel virtual */ - pmap_qremove(kva, ix); + + pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); /* - * if we have written the page, then indicate that the page is clean. + * cleanup pages. If an error occurs writing to swap, we are in + * very serious trouble. If it happens to be a disk error, though, + * we may be able to recover by reassigning the swap later on. So + * in this case we remove the m->swapblk assignment for the page + * but do not free it in the rlist. The errornous block(s) are thus + * never reallocated as swap. Redirty the page and continue. */ - if (rv == VM_PAGER_OK) { - for (i = firstidx; i < lastidx; i++) { - if (rtvals[i] == VM_PAGER_OK) { - pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); - m[i]->dirty = 0; + + for (i = 0; i < bp->b_npages; ++i) { + vm_page_t m = bp->b_pages[i]; + + vm_page_flag_clear(m, PG_SWAPINPROG); + + if (bp->b_flags & B_ERROR) { + /* + * If an error occurs I'd love to throw the swapblk + * away without freeing it back to swapspace, so it + * can never be used again. But I can't from an + * interrupt. + */ + + if (bp->b_flags & B_READ) { /* - * optimization, if a page has been read - * during the pageout process, we activate it. + * When reading, reqpage needs to stay + * locked for the parent, but all other + * pages can be freed. We still want to + * wakeup the parent waiting on the page, + * though. ( also: pg_reqpage can be -1 and + * not match anything ). + * + * We have to wake specifically requested pages + * up too because we cleared PG_SWAPINPROG and + * someone may be waiting for that. + * + * NOTE: for reads, m->dirty will probably + * be overriden by the original caller of + * getpages so don't play cute tricks here. + * + * XXX it may not be legal to free the page + * here as this messes with the object->memq's. */ - if (((m[i]->flags & (PG_WANTED|PG_REFERENCED)) || - pmap_ts_referenced(VM_PAGE_TO_PHYS(m[i])))) { - vm_page_activate(m[i]); - } + + m->valid = 0; + vm_page_flag_clear(m, PG_ZERO); + + if (i != bp->b_pager.pg_reqpage) + vm_page_free(m); + else + vm_page_flash(m); + /* + * If i == bp->b_pager.pg_reqpage, do not wake + * the page up. The caller needs to. + */ + } else { + /* + * If a write error occurs, reactivate page + * so it doesn't clog the inactive list, + * then finish the I/O. + */ + m->dirty = VM_PAGE_BITS_ALL; + vm_page_activate(m); + vm_page_io_finish(m); } - } - } else { - for (i = firstidx; i < lastidx; i++) { - rtvals[i] = rv; + } else if (bp->b_flags & B_READ) { + /* + * For read success, clear dirty bits. Nobody should + * have this page mapped but don't take any chances, + * make sure the pmap modify bits are also cleared. + * + * NOTE: for reads, m->dirty will probably be + * overriden by the original caller of getpages so + * we cannot set them in order to free the underlying + * swap in a low-swap situation. I don't think we'd + * want to do that anyway, but it was an optimization + * that existed in the old swapper for a time before + * it got ripped out due to precisely this problem. + * + * clear PG_ZERO in page. + * + * If not the requested page then deactivate it. + * + * Note that the requested page, reqpage, is left + * busied, but we still have to wake it up. The + * other pages are released (unbusied) by + * vm_page_wakeup(). We do not set reqpage's + * valid bits here, it is up to the caller. 
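+			 *
+			 * The distinction relied upon here, roughly
+			 * (the real routines live in vm_page.h):
+			 *
+			 *  vm_page_wakeup(m) - clear PG_BUSY, then wake
+			 *                      any sleepers
+			 *  vm_page_flash(m)  - wake any sleepers, leave
+			 *                      PG_BUSY alone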
+ */ + + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->valid = VM_PAGE_BITS_ALL; + m->dirty = 0; + vm_page_flag_clear(m, PG_ZERO); + + /* + * We have to wake specifically requested pages + * up too because we cleared PG_SWAPINPROG and + * could be waiting for it in getpages. However, + * be sure to not unbusy getpages specifically + * requested page - getpages expects it to be + * left busy. + */ + if (i != bp->b_pager.pg_reqpage) { + vm_page_deactivate(m); + vm_page_wakeup(m); + } else { + vm_page_flash(m); + } + } else { + /* + * For write success, clear the modify and dirty + * status, then finish the I/O ( which decrements the + * busy count and possibly wakes waiter's up ). + */ + vm_page_protect(m, VM_PROT_READ); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->dirty = 0; + vm_page_io_finish(m); } } - if (spc != NULL) { - if (bp->b_rcred != NOCRED) - crfree(bp->b_rcred); - if (bp->b_wcred != NOCRED) - crfree(bp->b_wcred); - spc_free(spc); - } else - relpbuf(bp); - if (swap_pager_free_pending) - swap_pager_sync(); - - return (rv); + /* + * adjust pip. NOTE: the original parent may still have its own + * pip refs on the object. + */ + + if (object) + vm_object_pip_wakeupn(object, bp->b_npages); + + /* + * release the physical I/O buffer + */ + + relpbuf(bp, ((bp->b_flags & B_READ) ? &nsw_rcount : &nsw_wcount)); + + splx(s); } -void -swap_pager_sync() +/************************************************************************ + * SWAP META DATA * + ************************************************************************ + * + * These routines manipulate the swap metadata stored in the + * OBJT_SWAP object. + * + * In fact, we just have a few counters in the vm_object_t. The + * metadata is actually stored in a hash table. + */ + +/* + * SWP_PAGER_HASH() - hash swap meta data + * + * This is an inline helper function which hash the swapblk given + * the object and page index. It returns a pointer to a pointer + * to the object, or a pointer to a NULL pointer if it could not + * find a swapblk. + */ + +static __inline struct swblock ** +swp_pager_hash(vm_object_t object, daddr_t index) { - swp_clean_t spc; + struct swblock **pswap; + struct swblock *swap; + + index &= ~SWAP_META_MASK; + pswap = &swhash[(index ^ (int)(long)object) & swhash_mask]; - while (spc = TAILQ_FIRST(&swap_pager_done)) { - swap_pager_finish(spc); + while ((swap = *pswap) != NULL) { + if (swap->swb_object == object && + swap->swb_index == index + ) { + break; + } + pswap = &swap->swb_hnext; } - return; + return(pswap); } +/* + * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object + * + * We first convert the object to a swap object if it is a default + * object. + * + * The specified swapblk is added to the object's swap metadata. If + * the swapblk is not valid, it is freed instead. Any previously + * assigned swapblk is freed. 
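+ *
+ *	Typical call, as issued from swap_pager_putpages() once swap
+ *	has been reserved for page j of a cluster:
+ *
+ *		swp_pager_meta_build(mreq->object, mreq->pindex,
+ *		    blk + j, 0);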
+ */ + static void -swap_pager_finish(spc) - register swp_clean_t spc; -{ - int i, s, lastidx; - vm_object_t object; - vm_page_t *ma; +swp_pager_meta_build( + vm_object_t object, + daddr_t index, + daddr_t swapblk, + int waitok +) { + struct swblock *swap; + struct swblock **pswap; - ma = spc->spc_m; - object = spc->spc_object; - lastidx = spc->spc_first + spc->spc_count; + /* + * Convert default object to swap object if necessary + */ - s = splvm(); - TAILQ_REMOVE(&swap_pager_done, spc, spc_list); - splx(s); + if (object->type != OBJT_SWAP) { + object->type = OBJT_SWAP; + object->un_pager.swp.swp_bcount = 0; + + if (object->handle != NULL) { + TAILQ_INSERT_TAIL( + NOBJLIST(object->handle), + object, + pager_object_list + ); + } else { + TAILQ_INSERT_TAIL( + &swap_pager_un_object_list, + object, + pager_object_list + ); + } + } + + /* + * Wait for free memory when waitok is TRUE prior to calling the + * zone allocator. + */ - pmap_qremove(spc->spc_kva, spc->spc_count); + while (waitok && cnt.v_free_count == 0) { + VM_WAIT; + } /* - * If no error, mark as clean and inform the pmap system. If error, - * mark as dirty so we will try again. (XXX could get stuck doing - * this, should give up after awhile) + * If swapblk being added is invalid, just free it. */ - if (spc->spc_flags & SPC_ERROR) { - for (i = spc->spc_first; i < lastidx; i++) { - printf("swap_pager_finish: I/O error, clean of page %lx failed\n", - (u_long) VM_PAGE_TO_PHYS(ma[i])); - ma[i]->dirty = VM_PAGE_BITS_ALL; - vm_page_io_finish(ma[i]); + if (swapblk & SWAPBLK_NONE) { + if (swapblk != SWAPBLK_NONE) { + swp_pager_freeswapspace( + index, + 1 + ); + swapblk = SWAPBLK_NONE; } + } - vm_object_pip_subtract(object, spc->spc_count); - if ((object->paging_in_progress == 0) && - (object->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(object, OBJ_PIPWNT); - wakeup(object); - } + /* + * Locate hash entry. If not found create, but if we aren't adding + * anything just return. + */ - } else { - for (i = spc->spc_first; i < lastidx; i++) { - if ((ma[i]->queue != PQ_ACTIVE) && - ((ma[i]->flags & PG_WANTED) || - pmap_ts_referenced(VM_PAGE_TO_PHYS(ma[i])))) { - vm_page_activate(ma[i]); - } - } + pswap = swp_pager_hash(object, index); + + if ((swap = *pswap) == NULL) { + int i; + + if (swapblk == SWAPBLK_NONE) + return; + + swap = *pswap = zalloc(swap_zone); + + swap->swb_hnext = NULL; + swap->swb_object = object; + swap->swb_index = index & ~SWAP_META_MASK; + swap->swb_count = 0; + + ++object->un_pager.swp.swp_bcount; + + for (i = 0; i < SWAP_META_PAGES; ++i) + swap->swb_pages[i] = SWAPBLK_NONE; } - nswiodone -= spc->spc_count; - swap_pager_free_pending--; - spc_free(spc); + /* + * Delete prior contents of metadata + */ - return; + index &= SWAP_META_MASK; + + if (swap->swb_pages[index] != SWAPBLK_NONE) { + swp_pager_freeswapspace( + swap->swb_pages[index] & SWAPBLK_MASK, + 1 + ); + --swap->swb_count; + } + + /* + * Enter block into metadata + */ + + swap->swb_pages[index] = swapblk; + ++swap->swb_count; } /* - * swap_pager_iodone + * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata + * + * The requested range of blocks is freed, with any associated swap + * returned to the swap bitmap. + * + * This routine will free swap metadata structures as they are cleaned + * out. This routine does *NOT* operate on swap metadata associated + * with resident pages. 
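+ *
+ *	When no swblock exists for the run containing the current
+ *	index, the loop below steps over the remainder of that run in
+ *	one shot (up to SWAP_META_PAGES entries) rather than probing
+ *	page by page.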
+ * + * This routine must be called at splvm() */ + static void -swap_pager_iodone(bp) - register struct buf *bp; +swp_pager_meta_free(vm_object_t object, daddr_t index, daddr_t count) { - int i, s, lastidx; - register swp_clean_t spc; - vm_object_t object; - vm_page_t *ma; + if (object->type != OBJT_SWAP) + return; + while (count > 0) { + struct swblock **pswap; + struct swblock *swap; - s = splvm(); - spc = (swp_clean_t) bp->b_spc; - TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); - TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); + pswap = swp_pager_hash(object, index); - object = spc->spc_object; + if ((swap = *pswap) != NULL) { + daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; -#if defined(DIAGNOSTIC) - if (object->paging_in_progress < spc->spc_count) - printf("swap_pager_iodone: paging_in_progress(%d) < spc_count(%d)\n", - object->paging_in_progress, spc->spc_count); -#endif - - if (bp->b_flags & B_ERROR) { - spc->spc_flags |= SPC_ERROR; - printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n", - (bp->b_flags & B_READ) ? "pagein" : "pageout", - (u_long) bp->b_blkno, bp->b_bcount, bp->b_error); - } else { - vm_object_pip_subtract(object, spc->spc_count); - if ((object->paging_in_progress == 0) && - (object->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(object, OBJ_PIPWNT); - wakeup(object); - } - ma = spc->spc_m; - lastidx = spc->spc_first + spc->spc_count; - for (i = spc->spc_first; i < lastidx; i++) { - /* - * we wakeup any processes that are waiting on these pages. - */ - vm_page_io_finish(ma[i]); + if (v != SWAPBLK_NONE) { + swp_pager_freeswapspace(v, 1); + swap->swb_pages[index & SWAP_META_MASK] = + SWAPBLK_NONE; + if (--swap->swb_count == 0) { + *pswap = swap->swb_hnext; + zfree(swap_zone, swap); + --object->un_pager.swp.swp_bcount; + } + } + --count; + ++index; + } else { + daddr_t n = SWAP_META_PAGES - (index & SWAP_META_MASK); + count -= n; + index += n; } } +} + +/* + * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object + * + * This routine locates and destroys all swap metadata associated with + * an object. 
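+ *
+ *	Termination is driven by swp_bcount: destroying a swblock
+ *	decrements it, and the sweep advances the probe index by
+ *	SWAP_META_PAGES per step until the count reaches zero.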
+ */ + +static void +swp_pager_meta_free_all(vm_object_t object) +{ + daddr_t index = 0; - if (bp->b_vp) - pbrelvp(bp); + if (object->type != OBJT_SWAP) + return; - if (bp->b_rcred != NOCRED) - crfree(bp->b_rcred); - if (bp->b_wcred != NOCRED) - crfree(bp->b_wcred); + while (object->un_pager.swp.swp_bcount) { + struct swblock **pswap; + struct swblock *swap; - nswiodone += spc->spc_count; - swap_pager_free_pending++; - if (--spc->spc_object->un_pager.swp.swp_poip == 0) { - wakeup(spc->spc_object); - } + pswap = swp_pager_hash(object, index); + if ((swap = *pswap) != NULL) { + int i; - if (swap_pager_needflags && - ((swap_pager_free_count + swap_pager_free_pending) > (npendingio / 2))) { - spc_wakeup(); + for (i = 0; i < SWAP_META_PAGES; ++i) { + daddr_t v = swap->swb_pages[i]; + if (v != SWAPBLK_NONE) { +#if !defined(MAX_PERF) + --swap->swb_count; +#endif + swp_pager_freeswapspace( + v, + 1 + ); + } + } +#if !defined(MAX_PERF) + if (swap->swb_count != 0) + panic("swap_pager_meta_free_all: swb_count != 0"); +#endif + *pswap = swap->swb_hnext; + zfree(swap_zone, swap); + --object->un_pager.swp.swp_bcount; + } + index += SWAP_META_PAGES; +#if !defined(MAX_PERF) + if (index > 0x20000000) + panic("swp_pager_meta_free_all: failed to locate all swap meta blocks"); +#endif } +} - if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) && - vm_pageout_pages_needed) { - wakeup(&vm_pageout_pages_needed); - vm_pageout_pages_needed = 0; +/* + * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. + * + * This routine is capable of looking up, popping, or freeing + * swapblk assignments in the swap meta data or in the vm_page_t. + * The routine typically returns the swapblk being looked-up, or popped, + * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block + * was invalid. This routine will automatically free any invalid + * meta-data swapblks. + * + * It is not possible to store invalid swapblks in the swap meta data + * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. + * + * When acting on a busy resident page and paging is in progress, we + * have to wait until paging is complete but otherwise can act on the + * busy page. + * + * SWM_FREE remove and free swap block from metadata + * + * SWM_POP remove from meta data but do not free.. pop it out + */ + +static daddr_t +swp_pager_meta_ctl( + vm_object_t object, + vm_pindex_t index, + int flags +) { + /* + * The meta data only exists of the object is OBJT_SWAP + * and even then might not be allocated yet. + */ + + if ( + object->type != OBJT_SWAP || + object->un_pager.swp.swp_bcount == 0 + ) { + return(SWAPBLK_NONE); } - splx(s); + { + struct swblock **pswap; + struct swblock *swap; + daddr_t r1 = SWAPBLK_NONE; + + pswap = swp_pager_hash(object, index); + + index &= SWAP_META_MASK; + + if ((swap = *pswap) != NULL) { + r1 = swap->swb_pages[index]; + + if (r1 != SWAPBLK_NONE) { + if (flags & SWM_FREE) { + swp_pager_freeswapspace( + r1, + 1 + ); + r1 = SWAPBLK_NONE; + } + if (flags & (SWM_FREE|SWM_POP)) { + swap->swb_pages[index] = SWAPBLK_NONE; + if (--swap->swb_count == 0) { + *pswap = swap->swb_hnext; + zfree(swap_zone, swap); + --object->un_pager.swp.swp_bcount; + } + } + } + } + + return(r1); + } + /* not reached */ } + diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index ceb88b6..374223c 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. 
* * from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90 - * $Id: swap_pager.h,v 1.21 1998/04/29 04:28:02 dyson Exp $ + * $Id: swap_pager.h,v 1.22 1998/07/10 21:50:17 alex Exp $ */ /* @@ -59,26 +59,50 @@ #define SWB_NPAGES 8 #endif +/* + * Piecemeal swap metadata structure. Swap is stored in a radix tree. + * + * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix + * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents + * 32K worth of data, two levels represent 256K, three levels represent + * 2 MBytes. This is acceptable. + * + * Overall memory utilization is about the same as the old swap structure. + */ + +#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) + +#define SWAP_META_PAGES (SWB_NPAGES * 2) +#define SWAP_META_MASK (SWAP_META_PAGES - 1) + struct swblock { - unsigned short swb_valid; /* bitmask for valid pages */ - unsigned short swb_locked; /* block locked */ - daddr_t swb_block[SWB_NPAGES]; + struct swblock *swb_hnext; + vm_object_t swb_object; + int swb_index; + int swb_count; + daddr_t swb_pages[SWAP_META_PAGES]; }; -typedef struct swblock *sw_blk_t; #ifdef KERNEL extern struct pagerlst swap_pager_un_object_list; extern int swap_pager_full; -extern struct rlisthdr swaplist; +extern struct blist *swapblist; + +int swap_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); +boolean_t swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, int *before, int *after)); -int swap_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); int swap_pager_swp_alloc __P((vm_object_t, int)); -void swap_pager_copy __P((vm_object_t, vm_pindex_t, vm_object_t, - vm_pindex_t, vm_pindex_t, int)); +void swap_pager_copy __P((vm_object_t, vm_object_t, vm_pindex_t, int)); void swap_pager_freespace __P((vm_object_t, vm_pindex_t, vm_size_t)); void swap_pager_dmzspace __P((vm_object_t, vm_pindex_t, vm_size_t)); void swap_pager_swap_init __P((void)); -void swap_pager_sync __P((void)); + +/* + * newswap functions + */ + +void swap_pager_page_removed __P((vm_page_t, vm_object_t)); + #endif #endif /* _SWAP_PAGER_ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index e3d64f9..d0f4754 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -66,7 +66,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_fault.c,v 1.92 1999/01/08 17:31:24 eivind Exp $ + * $Id: vm_fault.c,v 1.93 1999/01/10 01:58:28 eivind Exp $ */ /* @@ -114,7 +114,7 @@ struct faultstate { struct vnode *vp; }; -static void +static __inline void release_page(struct faultstate *fs) { vm_page_wakeup(fs->m); @@ -122,7 +122,7 @@ release_page(struct faultstate *fs) fs->m = NULL; } -static void +static __inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { @@ -263,36 +263,43 @@ RetryFault:; fs.object = fs.first_object; fs.pindex = fs.first_pindex; - /* - * See whether this page is resident - */ while (TRUE) { + /* + * If the object is dead, we stop here + */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } + + /* + * See if page is resident + */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { int queue, s; /* - * If the page is being brought in, wait for it and - * then retry. + * Wait/Retry if the page is busy. 
We have to do this + * if the page is busy via either PG_BUSY or + * vm_page_t->busy because the vm_pager may be using + * vm_page_t->busy for pageouts ( and even pageins if + * it is the vnode pager ), and we could end up trying + * to pagein and pageout the same page simultaniously. + * + * We can theoretically allow the busy case on a read + * fault if the page is marked valid, but since such + * pages are typically already pmap'd, putting that + * special case in might be more effort then it is + * worth. We cannot under any circumstances mess + * around with a vm_page_t->busy page except, perhaps, + * to pmap it. */ - if ((fs.m->flags & PG_BUSY) || - (fs.m->busy && - (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { + if ((fs.m->flags & PG_BUSY) || fs.m->busy) { unlock_things(&fs); - s = splvm(); - if ((fs.m->flags & PG_BUSY) || - (fs.m->busy && - (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { - vm_page_flag_set(fs.m, PG_WANTED | PG_REFERENCED); - cnt.v_intrans++; - tsleep(fs.m, PSWP, "vmpfw", 0); - } - splx(s); + (void)vm_page_sleep_busy(fs.m, TRUE, "vmpfw"); + cnt.v_intrans++; vm_object_deallocate(fs.first_object); goto RetryFault; } @@ -302,8 +309,12 @@ RetryFault:; vm_page_unqueue_nowakeup(fs.m); splx(s); +#if 0 /* - * Mark page busy for other processes, and the pagedaemon. + * Code removed. In a low-memory situation (say, a + * memory-bound program is running), the last thing you + * do is starve reactivations for other processes. + * XXX we need to find a better way. */ if (((queue - fs.m->pc) == PQ_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { @@ -312,6 +323,13 @@ RetryFault:; VM_WAIT; goto RetryFault; } +#endif + /* + * Mark page busy for other processes, and the + * pagedaemon. If it still isn't completely valid + * (readable), jump to readrest, else break-out ( we + * found the page ). + */ vm_page_busy(fs.m); if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && @@ -321,6 +339,12 @@ RetryFault:; break; } + + /* + * Page is not resident, If this is the search termination, + * allocate a new page. + */ + if (((fs.object->type != OBJT_DEFAULT) && (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) || (fs.object == fs.first_object)) { @@ -344,6 +368,13 @@ RetryFault:; } readrest: + /* + * Have page, but it may not be entirely valid ( or valid at + * all ). If this object is not the default, try to fault-in + * the page as well as activate additional pages when + * appropriate, and page-in additional pages when appropriate. + */ + if (fs.object->type != OBJT_DEFAULT && (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) { int rv; @@ -410,13 +441,16 @@ readrest: * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. + * + * fs.m plus the additional pages are PG_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); /* * Call the pager to retrieve the data, if any, after - * releasing the lock on the map. + * releasing the lock on the map. We hold a ref on + * fs.object and the pages are PG_BUSY'd. */ unlock_map(&fs); @@ -442,7 +476,7 @@ readrest: } hardfault++; - break; + break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this @@ -486,8 +520,8 @@ readrest: } } /* - * We get here if the object has default pager (or unwiring) or the - * pager doesn't have the page. + * We get here if the object has default pager (or unwiring) + * or the pager doesn't have the page. 
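+	 * In that case we descend to the backing object, if there is
+	 * one, and retry the lookup; running off the end of the shadow
+	 * chain lands us in the zero-fill case below.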
*/ if (fs.object == fs.first_object) fs.first_m = fs.m; @@ -518,15 +552,17 @@ readrest: cnt.v_ozfod++; } cnt.v_zfod++; - break; + break; /* break to PAGE HAS BEEN FOUND */ } else { if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); } + KASSERT(fs.object != next_object, ("object loop %p", next_object)); fs.object = next_object; vm_object_pip_add(fs.object, 1); } } + KASSERT((fs.m->flags & PG_BUSY) != 0, ("vm_fault: not busy after main loop")); @@ -549,14 +585,15 @@ readrest: */ if (fault_type & VM_PROT_WRITE) { - /* - * This allows pages to be virtually copied from a backing_object - * into the first_object, where the backing object has no other - * refs to it, and cannot gain any more refs. Instead of a - * bcopy, we just move the page from the backing object to the - * first object. Note that we must mark the page dirty in the - * first object so that it will go out to swap when needed. + * This allows pages to be virtually copied from a + * backing_object into the first_object, where the + * backing object has no other refs to it, and cannot + * gain any more refs. Instead of a bcopy, we just + * move the page from the backing object to the + * first object. Note that we must mark the page + * dirty in the first object so that it will go out + * to swap when needed. */ if (map_generation == fs.map->timestamp && /* @@ -598,11 +635,12 @@ readrest: fs.first_m = NULL; /* - * grab the page and put it into the process'es object + * grab the page and put it into the + * process'es object. The page is + * automatically made dirty. */ vm_page_rename(fs.m, fs.first_object, fs.first_pindex); fs.first_m = fs.m; - fs.first_m->dirty = VM_PAGE_BITS_ALL; vm_page_busy(fs.first_m); fs.m = NULL; cnt.v_cow_optim++; @@ -620,7 +658,13 @@ readrest: release_page(&fs); } + /* + * fs.object != fs.first_object due to above + * conditional + */ + vm_object_pip_wakeup(fs.object); + /* * Only use the new page below... */ @@ -708,9 +752,13 @@ readrest: * If the fault is a write, we know that this page is being * written NOW. This will save on the pmap_is_modified() calls * later. + * + * Also tell the backing pager, if any, that it should remove + * any swap backing since the page is now dirty. */ if (fault_flags & VM_FAULT_DIRTY) { fs.m->dirty = VM_PAGE_BITS_ALL; + vm_pager_page_unswapped(fs.m); } } @@ -1021,8 +1069,7 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) * if the requested page is not available, then give up now */ - if (!vm_pager_has_page(object, - OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) { + if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { return 0; } diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index ec844db..0a3309d 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -59,7 +59,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_glue.c,v 1.79 1998/12/19 08:23:31 julian Exp $ + * $Id: vm_glue.c,v 1.80 1999/01/07 21:23:50 julian Exp $ */ #include "opt_rlimit.h" @@ -213,10 +213,19 @@ vm_fork(p1, p2, flags) p1->p_vmspace->vm_refcnt++; } + /* + * Great, so we have a memory-heavy process and the + * entire machine comes to a screaching halt because + * nobody can fork/exec anything. What we really need + * to do is fix the process swapper so it swaps out the right + * processes. 
+ */ +#if 0 while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN); VM_WAIT; } +#endif if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index ea7f45b..b2e1102 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_kern.c,v 1.49 1998/08/24 08:39:37 dfr Exp $ + * $Id: vm_kern.c,v 1.50 1998/09/04 08:06:57 dfr Exp $ */ /* @@ -181,8 +181,9 @@ kmem_alloc(map, size) VM_ALLOC_ZERO | VM_ALLOC_RETRY); if ((mem->flags & PG_ZERO) == 0) vm_page_zero_fill(mem); - vm_page_flag_clear(mem, (PG_BUSY | PG_ZERO)); mem->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(mem, PG_ZERO); + vm_page_wakeup(mem); } /* @@ -200,6 +201,8 @@ kmem_alloc(map, size) * Release a region of kernel virtual memory allocated * with kmem_alloc, and return the physical pages * associated with that region. + * + * This routine may not block on kernel maps. */ void kmem_free(map, addr, size) @@ -252,26 +255,31 @@ kmem_suballoc(parent, min, max, size) } /* - * Allocate wired-down memory in the kernel's address map for the higher - * level kernel memory allocator (kern/kern_malloc.c). We cannot use - * kmem_alloc() because we may need to allocate memory at interrupt - * level where we cannot block (canwait == FALSE). + * kmem_malloc: + * + * Allocate wired-down memory in the kernel's address map for the higher + * level kernel memory allocator (kern/kern_malloc.c). We cannot use + * kmem_alloc() because we may need to allocate memory at interrupt + * level where we cannot block (canwait == FALSE). + * + * This routine has its own private kernel submap (kmem_map) and object + * (kmem_object). This, combined with the fact that only malloc uses + * this routine, ensures that we will never block in map or object waits. * - * This routine has its own private kernel submap (kmem_map) and object - * (kmem_object). This, combined with the fact that only malloc uses - * this routine, ensures that we will never block in map or object waits. + * Note that this still only works in a uni-processor environment and + * when called at splhigh(). * - * Note that this still only works in a uni-processor environment and - * when called at splhigh(). + * We don't worry about expanding the map (adding entries) since entries + * for wired maps are statically allocated. * - * We don't worry about expanding the map (adding entries) since entries - * for wired maps are statically allocated. + * NOTE: This routine is not supposed to block if M_NOWAIT is set, but + * I have not verified that it actually does not block. */ vm_offset_t -kmem_malloc(map, size, waitflag) +kmem_malloc(map, size, flags) register vm_map_t map; register vm_size_t size; - boolean_t waitflag; + int flags; { register vm_offset_t offset, i; vm_map_entry_t entry; @@ -297,7 +305,7 @@ kmem_malloc(map, size, waitflag) printf("Out of mbuf clusters - adjust NMBCLUSTERS or increase maxusers!\n"); return (0); } - if (waitflag == M_WAITOK) + if ((flags & M_NOWAIT) == 0) panic("kmem_malloc(%d): kmem_map too small: %d total allocated", size, map->size); return (0); @@ -308,9 +316,19 @@ kmem_malloc(map, size, waitflag) VM_PROT_ALL, VM_PROT_ALL, 0); for (i = 0; i < size; i += PAGE_SIZE) { + /* + * Note: if M_NOWAIT specified alone, allocate from + * interrupt-safe queues only (just the free list). 
If + * M_ASLEEP or M_USE_RESERVE is also specified, we can also + * allocate from the cache. Neither of the latter two + * flags may be specified from an interrupt since interrupts + * are not allowed to mess with the cache queue. + */ retry: m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), - (waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM); + ((flags & (M_NOWAIT|M_ASLEEP|M_USE_RESERVE)) == M_NOWAIT) ? + VM_ALLOC_INTERRUPT : + VM_ALLOC_SYSTEM); /* * Ran out of space, free everything up and return. Don't need @@ -318,7 +336,7 @@ retry: * aren't on any queues. */ if (m == NULL) { - if (waitflag == M_WAITOK) { + if ((flags & M_NOWAIT) == 0) { VM_WAIT; goto retry; } @@ -330,6 +348,9 @@ retry: } vm_map_delete(map, addr, addr + size); vm_map_unlock(map); + if (flags & M_ASLEEP) { + VM_AWAIT; + } return (0); } vm_page_flag_clear(m, PG_ZERO); @@ -359,6 +380,9 @@ retry: m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); vm_page_wire(m); vm_page_wakeup(m); + /* + * Because this is kernel_pmap, this call will not block. + */ pmap_enter(kernel_pmap, addr + i, VM_PAGE_TO_PHYS(m), VM_PROT_ALL, 1); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_REFERENCED); @@ -369,12 +393,14 @@ retry: } /* - * kmem_alloc_wait + * kmem_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * + * This routine may block. */ + vm_offset_t kmem_alloc_wait(map, size) vm_map_t map; @@ -406,7 +432,7 @@ kmem_alloc_wait(map, size) } /* - * kmem_free_wakeup + * kmem_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. @@ -424,11 +450,14 @@ kmem_free_wakeup(map, addr, size) } /* - * Create the kernel map; insert a mapping covering kernel text, data, bss, - * and all space allocated thus far (`boostrap' data). The new map will thus - * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and - * the range between `start' and `end' as free. + * kmem_init: + * + * Create the kernel map; insert a mapping covering kernel text, + * data, bss, and all space allocated thus far (`boostrap' data). The + * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and + * `start' as allocated, and the range between `start' and `end' as free. */ + void kmem_init(start, end) vm_offset_t start, end; @@ -445,3 +474,4 @@ kmem_init(start, end) /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } + diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 829548a..f495788 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_map.c,v 1.138 1998/10/25 17:44:58 phk Exp $ + * $Id: vm_map.c,v 1.139 1999/01/06 23:05:41 julian Exp $ */ /* @@ -440,7 +440,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; +#if 0 vm_object_t prev_object; +#endif u_char protoeflags; if ((object != NULL) && (cow & MAP_NOFAULT)) { @@ -514,10 +516,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, map->size += (end - prev_entry->end); prev_entry->end = end; +#if 0 + /* + * (no longer applies) + */ if ((cow & MAP_NOFAULT) == 0) { prev_object = prev_entry->object.vm_object; default_pager_convert_to_swapq(prev_object); } +#endif return (KERN_SUCCESS); } else { @@ -573,7 +580,12 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, (prev_entry->end >= new_entry->start)) map->first_free = new_entry; +#if 0 + /* + * (no longer applies) + */ default_pager_convert_to_swapq(object); +#endif return (KERN_SUCCESS); } @@ -1504,7 +1516,12 @@ vm_map_user_pageable(map, start, end, new_pageable) entry->offset = (vm_offset_t) 0; } +#if 0 + /* + * (no longer applies) + */ default_pager_convert_to_swapq(entry->object.vm_object); +#endif } vm_map_clip_start(map, entry, start); @@ -1695,7 +1712,12 @@ vm_map_pageable(map, start, end, new_pageable) atop(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } +#if 0 + /* + * (no longer applies) + */ default_pager_convert_to_swapq(entry->object.vm_object); +#endif } } vm_map_clip_start(map, entry, start); @@ -2192,16 +2214,18 @@ vm_map_split(entry) m = vm_page_lookup(orig_object, offidxstart + idx); if (m == NULL) continue; - if (m->flags & PG_BUSY) { - vm_page_flag_set(m, PG_WANTED); - tsleep(m, PVM, "spltwt", 0); + + /* + * We must wait for pending I/O to complete before we can + * rename the page. + */ + if (vm_page_sleep_busy(m, TRUE, "spltwt")) goto retry; - } vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_rename(m, new_object, idx); - m->dirty = VM_PAGE_BITS_ALL; + /* page automatically made dirty by rename */ vm_page_busy(m); } @@ -2212,9 +2236,7 @@ vm_map_split(entry) * and destroy unneeded pages in * shadow object. */ - swap_pager_copy(orig_object, OFF_TO_IDX(orig_object->paging_offset), - new_object, OFF_TO_IDX(new_object->paging_offset), - offidxstart, 0); + swap_pager_copy(orig_object, new_object, offidxstart, 0); vm_object_pip_wakeup(orig_object); } @@ -2670,8 +2692,13 @@ RetryLookup:; vm_map_lock_downgrade(share_map); } +#if 0 + /* + * (no longer applies) + */ if (entry->object.vm_object->type == OBJT_DEFAULT) default_pager_convert_to_swapq(entry->object.vm_object); +#endif /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. @@ -2781,6 +2808,10 @@ vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages) vm_map_lookup_done(map, entry); return 0; } + /* + * disallow busy or invalid pages, but allow + * m->busy pages if they are entirely valid. + */ if ((m->flags & PG_BUSY) || ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { vm_map_lookup_done(map, entry); @@ -2856,7 +2887,7 @@ vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages) */ if (first_object->type == OBJT_SWAP) { swap_pager_freespace(first_object, - OFF_TO_IDX(first_object->paging_offset), + 0, first_object->size); } diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 5bc74bd..bb52f66 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 - * $Id: vm_meter.c,v 1.26 1998/08/24 08:39:37 dfr Exp $ + * $Id: vm_meter.c,v 1.27 1998/10/31 17:21:31 peter Exp $ */ #include <sys/param.h> @@ -195,6 +195,11 @@ vmtotal SYSCTL_HANDLER_ARGS for (object = TAILQ_FIRST(&vm_object_list); object != NULL; object = TAILQ_NEXT(object, object_list)) { + /* + * devices, like /dev/mem, will badly skew our totals + */ + if (object->type == OBJT_DEVICE) + continue; totalp->t_vm += object->size; totalp->t_rm += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index ba36e41..1374dfb 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -38,7 +38,7 @@ * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id: vm_mmap.c,v 1.85 1998/12/09 20:22:21 dt Exp $ + * $Id: vm_mmap.c,v 1.86 1999/01/06 23:05:42 julian Exp $ */ /* @@ -71,6 +71,7 @@ #include <vm/pmap.h> #include <vm/vm_map.h> #include <vm/vm_object.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vm_pageout.h> #include <vm/vm_extern.h> diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index a1477f2..86c71c8 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.c,v 1.137 1999/01/08 17:31:26 eivind Exp $ + * $Id: vm_object.c,v 1.138 1999/01/10 01:58:28 eivind Exp $ */ /* @@ -134,9 +134,12 @@ static long object_bypasses; static int next_index; static vm_zone_t obj_zone; static struct vm_zone obj_zone_store; +static int object_hash_rand; #define VM_OBJECTS_INIT 256 static struct vm_object vm_objects_init[VM_OBJECTS_INIT]; +#if 0 static int objidnumber; +#endif void _vm_object_allocate(type, size, object) @@ -152,7 +155,9 @@ _vm_object_allocate(type, size, object) object->size = size; object->ref_count = 1; object->flags = 0; +#if 0 object->id = ++objidnumber; +#endif if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) vm_object_set_flag(object, OBJ_ONEMAPPING); object->behavior = OBJ_NORMAL; @@ -168,16 +173,25 @@ _vm_object_allocate(type, size, object) incr = size; next_index = (next_index + incr) & PQ_L2_MASK; object->handle = NULL; - object->paging_offset = (vm_ooffset_t) 0; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; +#if 0 object->page_hint = NULL; +#endif + /* + * Try to generate a number that will spread objects out in the + * hash table. We 'wipe' new objects across the hash in 128 page + * increments plus 1 more to offset it a little more by the time + * it wraps around. 
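+	 *
+	 * E.g. successive objects receive hash_rand values of -129,
+	 * -258, -387, ..., so pages with equal pindexes in different
+	 * objects tend to fall into different hash buckets.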
+ */ + object->hash_rand = object_hash_rand - 129; object->last_read = 0; object->generation++; TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); vm_object_count++; + object_hash_rand = object->hash_rand; } /* @@ -336,25 +350,15 @@ vm_object_deallocate(object) robject->ref_count++; - retry: - if (robject->paging_in_progress || - object->paging_in_progress) { + while ( + robject->paging_in_progress || + object->paging_in_progress + ) { vm_object_pip_sleep(robject, "objde1"); - if (robject->paging_in_progress && - robject->type == OBJT_SWAP) { - swap_pager_sync(); - goto retry; - } - vm_object_pip_sleep(object, "objde2"); - if (object->paging_in_progress && - object->type == OBJT_SWAP) { - swap_pager_sync(); - } - goto retry; } - if( robject->ref_count == 1) { + if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; @@ -396,6 +400,7 @@ doterm: * up all previously used resources. * * The object must be locked. + * This routine may block. */ void vm_object_terminate(object) @@ -444,13 +449,13 @@ vm_object_terminate(object) /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. Don't free wired pages, just - * remove them from the object. + * remove them from the object. */ s = splvm(); while ((p = TAILQ_FIRST(&object->memq)) != NULL) { #if !defined(MAX_PERF) if (p->busy || (p->flags & PG_BUSY)) - printf("vm_object_terminate: freeing busy page\n"); + panic("vm_object_terminate: freeing busy page %p\n", p); #endif if (p->wire_count == 0) { vm_page_busy(p); @@ -566,9 +571,7 @@ rescan: } s = splvm(); - while ((p->flags & PG_BUSY) || p->busy) { - vm_page_flag_set(p, PG_WANTED | PG_REFERENCED); - tsleep(p, PVM, "vpcwai", 0); + while (vm_page_sleep_busy(p, TRUE, "vpcwai")) { if (object->generation != curgeneration) { splx(s); goto rescan; @@ -763,6 +766,12 @@ vm_object_pmap_remove(object, start, end) * vm_object_madvise: * * Implements the madvise function at the object/page level. + * + * Currently, madvise() functions are limited to the default and + * swap object types only, and also limited to only the unshared portions + * of a process's address space. MADV_FREE, certainly, could never be + * run on anything else. The others are more flexible and the code could + * be adjusted in the future to handle expanded cases for them. */ void vm_object_madvise(object, pindex, count, advise) @@ -780,22 +789,59 @@ vm_object_madvise(object, pindex, count, advise) end = pindex + count; - for (; pindex < end; pindex += 1) { + /* + * MADV_FREE special case - free any swap backing store (as well + * as resident pages later on). 
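
For context, this is the kernel side of the userland hint; a consumer such as a malloc implementation would mark a dead run of pages roughly as follows (illustrative userland fragment, not part of the patch):

	#include <sys/types.h>
	#include <sys/mman.h>

	/*
	 * Tell the VM system the contents of the run are disposable.  Per
	 * the MADV_FREE handling above, any swap backing store is released
	 * and resident pages are cleaned and cached; the mapping itself
	 * stays valid and refaults as zero-fill if touched again.
	 */
	static void
	release_run(void *addr, size_t len)
	{
		(void)madvise(addr, len, MADV_FREE);
	}
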
+ */ + + if (advise == MADV_FREE) { + tobject = object; + tpindex = pindex; + while ( + (tobject->type == OBJT_DEFAULT || + tobject->type == OBJT_SWAP) && + (tobject->flags & OBJ_ONEMAPPING) + ) { + if (tobject->type == OBJT_SWAP) { + swap_pager_freespace(tobject, tpindex, count); + } + if ((tobject = tobject->backing_object) == NULL) + break; + tpindex += OFF_TO_IDX(tobject->backing_object_offset); + } + } + + /* + * Locate and adjust resident pages + */ + + for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: + + if (tobject->type != OBJT_DEFAULT && + tobject->type != OBJT_SWAP + ) { + continue; + } + + if ((tobject->flags & OBJ_ONEMAPPING) == 0) + continue; + m = vm_page_lookup(tobject, tpindex); + if (m == NULL) { - if (tobject->type != OBJT_DEFAULT) { - continue; - } - tobject = tobject->backing_object; + if (tobject == NULL) + continue; +#if 0 if ((tobject == NULL) || (tobject->ref_count != 1)) { continue; } +#endif tpindex += OFF_TO_IDX(tobject->backing_object_offset); goto shadowlookup; } @@ -805,12 +851,15 @@ shadowlookup: * we skip it. Things can break if we mess with pages * in any of the below states. */ - if (m->hold_count || m->wire_count || - m->valid != VM_PAGE_BITS_ALL) { + if ( + m->hold_count || + m->wire_count || + m->valid != VM_PAGE_BITS_ALL + ) { continue; } - if (vm_page_sleep(m, "madvpo", &m->busy)) + if (vm_page_sleep_busy(m, TRUE, "madvpo")) goto relookup; if (advise == MADV_WILLNEED) { @@ -818,15 +867,25 @@ shadowlookup: } else if (advise == MADV_DONTNEED) { vm_page_deactivate(m); } else if (advise == MADV_FREE) { - pmap_clear_modify(VM_PAGE_TO_PHYS(m)); - m->dirty = 0; /* - * Force a demand zero if attempt to read from swap. - * We currently don't handle vnode files correctly, - * and will reread stale contents unnecessarily. + * If MADV_FREE_FORCE_FREE is defined, we attempt to + * immediately free the page. Otherwise we just + * destroy any swap backing store, mark it clean, + * and stuff it into the cache. */ - if (object->type == OBJT_SWAP) - swap_pager_dmzspace(tobject, m->pindex, 1); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->dirty = 0; + +#ifdef MADV_FREE_FORCE_FREE + if (tobject->resident_page_count > 1) { + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } else +#endif + { + vm_page_cache(m); + } } } } @@ -900,8 +959,7 @@ vm_object_qcollapse(object) register vm_object_t object; { register vm_object_t backing_object; - register vm_pindex_t backing_offset_index, paging_offset_index; - vm_pindex_t backing_object_paging_offset_index; + register vm_pindex_t backing_offset_index; vm_pindex_t new_pindex; register vm_page_t p, pp; register vm_size_t size; @@ -913,27 +971,39 @@ vm_object_qcollapse(object) backing_object->ref_count += 2; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); - backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset); - paging_offset_index = OFF_TO_IDX(object->paging_offset); size = object->size; + p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next; + /* + * setup for loop. + * loop if the page isn't trivial. + */ + next = TAILQ_NEXT(p, listq); if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) || !p->valid || p->hold_count || p->wire_count || p->busy) { p = next; continue; } + + /* + * busy the page and move it from the backing store to the + * parent object. 
+ */ + vm_page_busy(p); + KASSERT(p->object == object, ("vm_object_qcollapse(): object mismatch")); + new_pindex = p->pindex - backing_offset_index; if (p->pindex < backing_offset_index || new_pindex >= size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, - backing_object_paging_offset_index+p->pindex, + p->pindex, 1); vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); @@ -941,16 +1011,16 @@ vm_object_qcollapse(object) pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, - paging_offset_index + new_pindex, NULL, NULL))) { + new_pindex, NULL, NULL))) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, - backing_object_paging_offset_index + p->pindex, 1); + p->pindex, 1); vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); } else { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, - backing_object_paging_offset_index + p->pindex, 1); + p->pindex, 1); if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); @@ -958,7 +1028,7 @@ vm_object_qcollapse(object) vm_page_protect(p, VM_PROT_NONE); vm_page_rename(p, object, new_pindex); - p->dirty = VM_PAGE_BITS_ALL; + /* page automatically made dirty by rename */ } } p = next; @@ -1049,9 +1119,10 @@ vm_object_collapse(object) */ while ((p = TAILQ_FIRST(&backing_object->memq)) != 0) { - - new_pindex = p->pindex - backing_offset_index; + if (vm_page_sleep_busy(p, TRUE, "vmocol")) + continue; vm_page_busy(p); + new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this @@ -1068,7 +1139,7 @@ vm_object_collapse(object) } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, - OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) { + new_pindex, NULL, NULL))) { vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); } else { @@ -1077,7 +1148,7 @@ vm_object_collapse(object) else vm_page_protect(p, VM_PROT_NONE); vm_page_rename(p, object, new_pindex); - p->dirty = VM_PAGE_BITS_ALL; + /* page automatically made dirty by rename */ } } } @@ -1088,52 +1159,22 @@ vm_object_collapse(object) if (backing_object->type == OBJT_SWAP) { vm_object_pip_add(backing_object, 1); - if (object->type == OBJT_SWAP) { - vm_object_pip_add(object, 1); - /* - * copy shadow object pages into ours - * and destroy unneeded pages in - * shadow object. - */ - swap_pager_copy( - backing_object, - OFF_TO_IDX(backing_object->paging_offset), - object, - OFF_TO_IDX(object->paging_offset), - OFF_TO_IDX(object->backing_object_offset), TRUE); - vm_object_pip_wakeup(object); - } else { - vm_object_pip_add(object, 1); - /* - * move the shadow backing_object's pager data to - * "object" and convert "object" type to OBJT_SWAP. - */ - object->type = OBJT_SWAP; - object->un_pager.swp.swp_nblocks = - backing_object->un_pager.swp.swp_nblocks; - object->un_pager.swp.swp_allocsize = - backing_object->un_pager.swp.swp_allocsize; - object->un_pager.swp.swp_blocks = - backing_object->un_pager.swp.swp_blocks; - object->un_pager.swp.swp_poip = /* XXX */ - backing_object->un_pager.swp.swp_poip; - object->paging_offset = backing_object->paging_offset + backing_offset; - TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); - - /* - * Convert backing object from OBJT_SWAP to - * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is - * actually necessary. 
- */ - backing_object->type = OBJT_DEFAULT; - TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list); - /* - * free unnecessary blocks - */ - swap_pager_freespace(object, 0, - OFF_TO_IDX(object->paging_offset)); - vm_object_pip_wakeup(object); - } + + /* + * scrap the paging_offset junk and do a + * discrete copy. This also removes major + * assumptions about how the swap-pager + * works from where it doesn't belong. The + * new swapper is able to optimize the + * destroy-source case. + */ + + vm_object_pip_add(object, 1); + swap_pager_copy( + backing_object, + object, + OFF_TO_IDX(object->backing_object_offset), TRUE); + vm_object_pip_wakeup(object); vm_object_pip_wakeup(backing_object); } @@ -1223,7 +1264,7 @@ vm_object_collapse(object) vm_page_busy(pp); if ((pp->valid == 0) && - !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) { + !vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * Page still needed. Can't go any * further. @@ -1318,7 +1359,7 @@ again: * interrupt -- minimize the spl transitions */ - if (vm_page_sleep(p, "vmopar", &p->busy)) + if (vm_page_sleep_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { @@ -1349,7 +1390,7 @@ again: * The busy flags are only cleared at * interrupt -- minimize the spl transitions */ - if (vm_page_sleep(p, "vmopar", &p->busy)) + if (vm_page_sleep_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { @@ -1589,11 +1630,10 @@ DB_SHOW_COMMAND(object, vm_object_print_static) object, (int)object->type, (u_long)object->size, object->resident_page_count, object->ref_count, object->flags); /* - * XXX no %qd in kernel. Truncate object->paging_offset and - * object->backing_object_offset. + * XXX no %qd in kernel. Truncate object->backing_object_offset. */ - db_iprintf(" sref=%d, offset=0x%lx, backing_object(%d)=(%p)+0x%lx\n", - object->shadow_count, (long)object->paging_offset, + db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n", + object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (long)object->backing_object_offset); diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 9897393..7f54ab6 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.h,v 1.50 1998/08/06 08:33:19 dfr Exp $ + * $Id: vm_object.h,v 1.51 1998/08/24 08:39:37 dfr Exp $ */ /* @@ -81,6 +81,7 @@ typedef enum obj_type objtype_t; * Types defined: * * vm_object_t Virtual memory object. + * */ struct vm_object { @@ -94,32 +95,49 @@ struct vm_object { int ref_count; /* How many refs?? 
*/ int shadow_count; /* how many objects that this is a shadow for */ int pg_color; /* color of first page in obj */ - int id; /* ID for no purpose, other than info */ +#if 0 + int id; /* ID for no purpose, other than info */ +#endif + int hash_rand; /* vm hash table randomizer */ u_short flags; /* see below */ u_short paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ u_short behavior; /* see below */ int resident_page_count; /* number of resident pages */ - int cache_count; /* number of cached pages */ - int wire_count; /* number of wired pages */ - vm_ooffset_t paging_offset; /* Offset into paging space */ + int cache_count; /* number of cached pages */ + int wire_count; /* number of wired pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ vm_offset_t last_read; /* last read in object -- detect seq behavior */ - vm_page_t page_hint; /* hint for last looked-up or allocated page */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ void *handle; union { + /* + * VNode pager + * + * vnp_size - current size of file + */ struct { - off_t vnp_size; /* Current size of file */ + off_t vnp_size; } vnp; + + /* + * Device pager + * + * devp_pglist - list of allocated pages + */ struct { - TAILQ_HEAD(, vm_page) devp_pglist; /* list of pages allocated */ + TAILQ_HEAD(, vm_page) devp_pglist; } devp; + + /* + * Swap pager + * + * swp_bcount - number of swap 'swblock' metablocks, each + * contains up to 16 swapblk assignments. + * see vm/swap_pager.h + */ struct { - int swp_nblocks; - int swp_allocsize; - struct swblock *swp_blocks; - short swp_poip; + int swp_bcount; } swp; } un_pager; }; @@ -132,7 +150,7 @@ struct vm_object { #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_WRITEABLE 0x0080 /* object has been made writable */ -#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */ +#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */ #define OBJ_CLEANING 0x0200 #define OBJ_OPT 0x1000 /* I/O optimization */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ @@ -197,12 +215,21 @@ vm_object_pip_wakeup(vm_object_t object) } static __inline void -vm_object_pip_sleep(vm_object_t object, char *waitid) +vm_object_pip_wakeupn(vm_object_t object, int i) { - int s; + if (i) + atomic_subtract_short(&object->paging_in_progress, i); + if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { + vm_object_clear_flag(object, OBJ_PIPWNT); + wakeup(object); + } +} +static __inline void +vm_object_pip_sleep(vm_object_t object, char *waitid) +{ if (object->paging_in_progress) { - s = splvm(); + int s = splvm(); if (object->paging_in_progress) { vm_object_set_flag(object, OBJ_PIPWNT); tsleep(object, PVM, waitid, 0); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c953559..2f0f4bd 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $Id: vm_page.c,v 1.115 1999/01/08 17:31:27 eivind Exp $ + * $Id: vm_page.c,v 1.116 1999/01/10 01:58:29 eivind Exp $ */ /* @@ -83,6 +83,7 @@ #include <vm/vm_object.h> #include <vm/vm_page.h> #include <vm/vm_pageout.h> +#include <vm/vm_pager.h> #include <vm/vm_extern.h> static void vm_page_queue_init __P((void)); @@ -95,7 +96,7 @@ static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t)); * page structure. 
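
vm_object_pip_wakeupn() above is the batched sibling of vm_object_pip_wakeup(): it retires several paging_in_progress references in one atomic subtraction, and a count of zero still performs the OBJ_PIPWNT check, so it doubles as a pure "poke the waiters" call. A hypothetical multi-page completion path (the function name is invented):

	/*
	 * Sketch: finish 'count' pages of I/O against 'object' at once
	 * instead of calling vm_object_pip_wakeup() in a loop.
	 */
	static void
	example_iodone(vm_object_t object, int count)
	{
		vm_object_pip_wakeupn(object, count);
	}
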
*/ -static struct pglist *vm_page_buckets; /* Array of buckets */ +static struct vm_page **vm_page_buckets; /* Array of buckets */ static int vm_page_bucket_count; /* How big is array? */ static int vm_page_hash_mask; /* Mask for hash function */ static volatile int vm_page_bucket_generation; @@ -162,7 +163,6 @@ static u_short vm_page_dev_bsize_chunks[] = { }; static __inline int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex)); -static int vm_page_freechk_and_unqueue __P((vm_page_t m)); static void vm_page_free_wakeup __P((void)); /* @@ -206,7 +206,7 @@ vm_page_startup(starta, enda, vaddr) { register vm_offset_t mapped; register vm_page_t m; - register struct pglist *bucket; + register struct vm_page **bucket; vm_size_t npages, page_range; register vm_offset_t new_start; int i; @@ -256,24 +256,30 @@ vm_page_startup(starta, enda, vaddr) * * The number of buckets MUST BE a power of 2, and the actual value is * the next power of 2 greater than the number of physical pages in - * the system. + * the system. + * + * We make the hash table approximately 2x the number of pages to + * reduce the chain length. This is about the same size using the + * singly-linked list as the 1x hash table we were using before + * using TAILQ but the chain length will be smaller. * * Note: This computation can be tweaked if desired. */ - vm_page_buckets = (struct pglist *) vaddr; + vm_page_buckets = (struct vm_page **)vaddr; bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { vm_page_bucket_count = 1; while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } + vm_page_bucket_count <<= 1; vm_page_hash_mask = vm_page_bucket_count - 1; /* * Validate these addresses. */ - new_start = start + vm_page_bucket_count * sizeof(struct pglist); + new_start = start + vm_page_bucket_count * sizeof(struct vm_page *); new_start = round_page(new_start); mapped = round_page(vaddr); vaddr = pmap_map(mapped, start, new_start, @@ -283,7 +289,7 @@ vm_page_startup(starta, enda, vaddr) bzero((caddr_t) mapped, vaddr - mapped); for (i = 0; i < vm_page_bucket_count; i++) { - TAILQ_INIT(bucket); + *bucket = NULL; bucket++; } @@ -353,13 +359,18 @@ vm_page_startup(starta, enda, vaddr) * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. * This routine may not block. + * + * We try to randomize the hash based on the object to spread the pages + * out in the hash table without it costing us too much. */ static __inline int vm_page_hash(object, pindex) vm_object_t object; vm_pindex_t pindex; { - return ((((uintptr_t) object) >> 5) + (pindex >> 1)) & vm_page_hash_mask; + int i = ((uintptr_t)object + pindex) ^ object->hash_rand; + + return(i & vm_page_hash_mask); } /* @@ -382,7 +393,7 @@ vm_page_insert(m, object, pindex) register vm_object_t object; register vm_pindex_t pindex; { - register struct pglist *bucket; + register struct vm_page **bucket; if (m->object != NULL) panic("vm_page_insert: already inserted"); @@ -399,7 +410,8 @@ vm_page_insert(m, object, pindex) */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; - TAILQ_INSERT_TAIL(bucket, m, hashq); + m->hnext = *bucket; + *bucket = m; vm_page_bucket_generation++; /* @@ -407,7 +419,9 @@ vm_page_insert(m, object, pindex) */ TAILQ_INSERT_TAIL(&object->memq, m, listq); +#if 0 m->object->page_hint = m; +#endif m->object->generation++; if (m->wire_count) @@ -417,50 +431,48 @@ vm_page_insert(m, object, pindex) object->cache_count++; /* - * And show that the object has one more resident page. 
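
A bare vm_page pointer per bucket is half the size of a TAILQ head, which is what pays for doubling vm_page_bucket_count while keeping the table's footprint roughly constant. The resulting insert and search, reduced to a sketch (splvm() protection assumed, as in vm_page_insert() and vm_page_lookup()):

	struct vm_page **bucket;

	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];

	m->hnext = *bucket;		/* head-insert the new page */
	*bucket = m;

	for (m = *bucket; m != NULL; m = m->hnext)	/* search the chain */
		if (m->object == object && m->pindex == pindex)
			break;
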
+ * show that the object has one more resident page. */ object->resident_page_count++; } /* - * vm_page_remove: [ internal use only ] + * vm_page_remove: * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page - * table and the object page list. + * table and the object page list, but do not invalidate/terminate + * the backing store. * * The object and page must be locked, and at splhigh. + * The underlying pmap entry (if any) is NOT removed here. * This routine may not block. - * - * I do not think the underlying pmap entry (if any) is removed here. */ -void +vm_object_t vm_page_remove(m) - register vm_page_t m; + vm_page_t m; { - register struct pglist *bucket; + register struct vm_page **bucket; vm_object_t object; if (m->object == NULL) - return; + return(NULL); #if !defined(MAX_PERF) if ((m->flags & PG_BUSY) == 0) { panic("vm_page_remove: page not busy"); } #endif - - vm_page_flag_clear(m, PG_BUSY); - if (m->flags & PG_WANTED) { - vm_page_flag_clear(m, PG_WANTED); - wakeup(m); - } + + /* + * Basically destroy the page. + */ + + vm_page_wakeup(m); object = m->object; - if (object->page_hint == m) - object->page_hint = NULL; if (m->wire_count) object->wire_count--; @@ -469,11 +481,23 @@ vm_page_remove(m) object->cache_count--; /* - * Remove from the object_object/offset hash table + * Remove from the object_object/offset hash table. The object + * must be on the hash queue, we will panic if it isn't + * + * Note: we must NULL-out m->hnext to prevent loops in detached + * buffers with vm_page_lookup(). */ bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)]; - TAILQ_REMOVE(bucket, m, hashq); + while (*bucket != m) { +#if !defined(MAX_PERF) + if (*bucket == NULL) + panic("vm_page_remove(): page not found in hash"); +#endif + bucket = &(*bucket)->hnext; + } + *bucket = m->hnext; + m->hnext = NULL; vm_page_bucket_generation++; /* @@ -490,6 +514,8 @@ vm_page_remove(m) object->generation++; m->object = NULL; + + return(object); } /* @@ -498,8 +524,14 @@ vm_page_remove(m) * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * + * NOTE: the code below does not lock. It will operate properly if + * an interrupt makes a change, but the generation algorithm will not + * operate properly in an SMP environment where both cpu's are able to run + * kernel code simultaniously. + * * The object must be locked. No side effects. * This routine may not block. + * This is a critical path routine */ vm_page_t @@ -508,25 +540,29 @@ vm_page_lookup(object, pindex) register vm_pindex_t pindex; { register vm_page_t m; - register struct pglist *bucket; + register struct vm_page **bucket; int generation; /* * Search the hash table for this object/offset pair */ +#if 0 if (object->page_hint && (object->page_hint->pindex == pindex) && (object->page_hint->object == object)) return object->page_hint; +#endif retry: generation = vm_page_bucket_generation; bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; - for (m = TAILQ_FIRST(bucket); m != NULL; m = TAILQ_NEXT(m,hashq)) { + for (m = *bucket; m != NULL; m = m->hnext) { if ((m->object == object) && (m->pindex == pindex)) { if (vm_page_bucket_generation != generation) goto retry; +#if 0 m->object->page_hint = m; +#endif return (m); } } @@ -545,6 +581,16 @@ retry: * This routine may not block. * * Note: this routine will raise itself to splvm(), the caller need not. + * + * Note: swap associated with the page must be invalidated by the move. 
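
The lookup above is deliberately lock-free: it snapshots vm_page_bucket_generation before walking the chain and retries if an insert or remove bumped the counter mid-scan. As the comment warns, this only holds up against interrupt-time mutation on a single cpu, not true SMP concurrency. The shape of the check, as a sketch:

	/*
	 * Sketch: generation-validated chain walk.  Any vm_page_insert()
	 * or vm_page_remove() increments vm_page_bucket_generation, which
	 * invalidates a walk in progress.
	 */
	retry:
		generation = vm_page_bucket_generation;
		for (m = *bucket; m != NULL; m = m->hnext) {
			if (m->object == object && m->pindex == pindex) {
				if (vm_page_bucket_generation != generation)
					goto retry;	/* chain changed under us */
				return (m);
			}
		}
		if (vm_page_bucket_generation != generation)
			goto retry;
		return (NULL);
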
We + * have to do this for several reasons: (1) we aren't freeing the + * page, (2) we are dirtying the page, (3) the VM system is probably + * moving the page from object A to B, and will then later move + * the backing store from A to B and we can't have a conflict. + * + * Note: we *always* dirty the page. It is necessary both for the + * fact that we moved it, and because we may be invalidating + * swap. */ void @@ -558,6 +604,7 @@ vm_page_rename(m, new_object, new_pindex) s = splvm(); vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); + m->dirty = VM_PAGE_BITS_ALL; splx(s); } @@ -625,6 +672,12 @@ vm_page_unqueue(m) * * Find a page on the specified queue with color optimization. * + * The page coloring optimization attempts to locate a page + * that does not overload other nearby pages in the object in + * the cpu's L1 or L2 caches. We need this optmization because + * cpu caches tend to be physical caches, while object spaces tend + * to be virtual. + * * This routine must be called at splvm(). * This routine may not block. */ @@ -759,7 +812,10 @@ vm_page_select_free(object, pindex, prefqueue) int i,j; int index, hindex; #endif - vm_page_t m, mh; + vm_page_t m; +#if 0 + vm_page_t mh; +#endif int oqueuediff; struct vpgqueues *pq; @@ -768,6 +824,7 @@ vm_page_select_free(object, pindex, prefqueue) else oqueuediff = PQ_ZERO - PQ_FREE; +#if 0 if (mh = object->page_hint) { if (mh->pindex == (pindex - 1)) { if ((mh->flags & PG_FICTITIOUS) == 0) { @@ -785,6 +842,7 @@ vm_page_select_free(object, pindex, prefqueue) } } } +#endif pq = &vm_page_queues[prefqueue]; @@ -857,6 +915,8 @@ vm_page_select_free(object, pindex, prefqueue) * Additional special handling is required when called from an * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with * the page cache in this case. + * + * vm_page_alloc() */ vm_page_t vm_page_alloc(object, pindex, page_req) @@ -864,7 +924,7 @@ vm_page_alloc(object, pindex, page_req) vm_pindex_t pindex; int page_req; { - register vm_page_t m; + register vm_page_t m = NULL; struct vpgqueues *pq; vm_object_t oldobject; int queue, qtype; @@ -873,12 +933,17 @@ vm_page_alloc(object, pindex, page_req) KASSERT(!vm_page_lookup(object, pindex), ("vm_page_alloc: page already allocated")); + /* + * The pager is allowed to eat deeper into the free page list. + */ + if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splvm(); +loop: switch (page_req) { case VM_ALLOC_NORMAL: @@ -961,20 +1026,36 @@ vm_page_alloc(object, pindex, page_req) queue = m->queue; qtype = queue - m->pc; - if (qtype == PQ_ZERO) - vm_page_zero_count--; + + /* + * Cache pages must be formally freed (and doubly so with the + * new pagerops functions). We free the page and try again. + * + * This also has the side effect of ensuring that the minfreepage + * wall is held more tightly verses the old code. 
+ */ + + if (qtype == PQ_CACHE) { +#if !defined(MAX_PERF) + if (m->dirty) + panic("found dirty cache page %p", m); + +#endif + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + goto loop; + } + pq = &vm_page_queues[queue]; TAILQ_REMOVE(pq->pl, m, pageq); (*pq->cnt)--; (*pq->lcnt)--; oldobject = NULL; + if (qtype == PQ_ZERO) { + vm_page_zero_count--; m->flags = PG_ZERO | PG_BUSY; - } else if (qtype == PQ_CACHE) { - oldobject = m->object; - vm_page_busy(m); - vm_page_remove(m); - m->flags = PG_BUSY; } else { m->flags = PG_BUSY; } @@ -1004,6 +1085,12 @@ vm_page_alloc(object, pindex, page_req) (cnt.v_free_count < cnt.v_pageout_free_min)) pagedaemon_wakeup(); +#if 0 + /* + * (code removed - was previously a manual breakout of the act of + * freeing a page from cache. We now just call vm_page_free() on + * a cache page an loop so this code no longer needs to be here) + */ if ((qtype == PQ_CACHE) && ((page_req == VM_ALLOC_NORMAL) || (page_req == VM_ALLOC_ZERO)) && oldobject && (oldobject->type == OBJT_VNODE) && @@ -1017,6 +1104,7 @@ vm_page_alloc(object, pindex, page_req) } } } +#endif splx(s); return (m); @@ -1048,6 +1136,33 @@ vm_wait() } /* + * vm_await: (also see VM_AWAIT macro) + * + * asleep on an event that will signal when free pages are available + * for allocation. + */ + +void +vm_await() +{ + int s; + + s = splvm(); + if (curproc == pageproc) { + vm_pageout_pages_needed = 1; + asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0); + } else { + if (!vm_pages_needed) { + vm_pages_needed++; + wakeup(&vm_pages_needed); + } + asleep(&cnt.v_free_count, PVM, "vmwait", 0); + } + splx(s); +} + +#if 0 +/* * vm_page_sleep: * * Block until page is no longer busy. @@ -1069,6 +1184,38 @@ vm_page_sleep(vm_page_t m, char *msg, char *busy) { return slept; } +#endif + +#if 0 + +/* + * vm_page_asleep: + * + * Similar to vm_page_sleep(), but does not block. Returns 0 if + * the page is not busy, or 1 if the page is busy. + * + * This routine has the side effect of calling asleep() if the page + * was busy (1 returned). + */ + +int +vm_page_asleep(vm_page_t m, char *msg, char *busy) { + int slept = 0; + if ((busy && *busy) || (m->flags & PG_BUSY)) { + int s; + s = splvm(); + if ((busy && *busy) || (m->flags & PG_BUSY)) { + vm_page_flag_set(m, PG_WANTED); + asleep(m, PVM, msg, 0); + slept = 1; + } + splx(s); + } + return slept; +} + +#endif + /* * vm_page_activate: * @@ -1111,13 +1258,49 @@ vm_page_activate(m) * * This routine may not block. */ -static int -vm_page_freechk_and_unqueue(m) - vm_page_t m; +static __inline void +vm_page_free_wakeup() { - vm_object_t oldobject; + /* + * if pageout daemon needs pages, then tell it that there are + * some free. + */ + if (vm_pageout_pages_needed) { + wakeup(&vm_pageout_pages_needed); + vm_pageout_pages_needed = 0; + } + /* + * wakeup processes that are waiting on memory if we hit a + * high water mark. And wakeup scheduler process if we have + * lots of memory. this process will swapin processes. + */ + if (vm_pages_needed && + ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) { + wakeup(&cnt.v_free_count); + vm_pages_needed = 0; + } +} - oldobject = m->object; +/* + * vm_page_free_toq: + * + * Returns the given page to the PQ_FREE or PQ_ZERO list, + * disassociating it with any VM object. + * + * Object and page must be locked prior to entry. + * This routine may not block. 
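
The allocator above no longer steals PQ_CACHE pages in place; a cache page is formally freed, which runs the normal object-detach path, and the allocation loops to pick it (or another page) back up from the free queues. Reduced to a sketch (select_page() is an invented stand-in for the queue selection logic above):

	loop:
		m = select_page(object, pindex, page_req);	/* hypothetical */
		if ((m->queue - m->pc) == PQ_CACHE) {
			/* cache pages are clean by definition */
			vm_page_busy(m);
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_free(m);	/* cache -> PQ_FREE, object detached */
			goto loop;		/* reallocate from the free queues */
		}
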
+ */ + +void +vm_page_free_toq(vm_page_t m, int queue) +{ + int s; + struct vpgqueues *pq; + vm_object_t object = m->object; + + s = splvm(); + + cnt.v_tfree++; #if !defined(MAX_PERF) if (m->busy || ((m->queue - m->pc) == PQ_FREE) || @@ -1133,11 +1316,24 @@ vm_page_freechk_and_unqueue(m) } #endif + /* + * unqueue, then remove page. Note that we cannot destroy + * the page here because we do not want to call the pager's + * callback routine until after we've put the page on the + * appropriate free queue. + */ + vm_page_unqueue_nowakeup(m); vm_page_remove(m); + /* + * If fictitious remove object association and + * return, otherwise delay object association removal. + */ + if ((m->flags & PG_FICTITIOUS) != 0) { - return 0; + splx(s); + return; } m->valid = 0; @@ -1156,10 +1352,17 @@ vm_page_freechk_and_unqueue(m) cnt.v_wire_count--; } - if (oldobject && (oldobject->type == OBJT_VNODE) && - ((oldobject->flags & OBJ_DEAD) == 0)) { - struct vnode *vp; - vp = (struct vnode *) oldobject->handle; + /* + * If we've exhausted the object's resident pages we want to free + * it up. + */ + + if (object && + (object->type == OBJT_VNODE) && + ((object->flags & OBJ_DEAD) == 0) + ) { + struct vnode *vp = (struct vnode *)object->handle; + if (vp && VSHOULDFREE(vp)) { if ((vp->v_flag & (VTBFREE|VDOOMED|VFREE)) == 0) { TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist); @@ -1172,107 +1375,31 @@ vm_page_freechk_and_unqueue(m) pmap_page_is_free(m); #endif - return 1; -} - -/* - * helper routine for vm_page_free and vm_page_free_zero. - * - * This routine may not block. - */ -static __inline void -vm_page_free_wakeup() -{ - -/* - * if pageout daemon needs pages, then tell it that there are - * some free. - */ - if (vm_pageout_pages_needed) { - wakeup(&vm_pageout_pages_needed); - vm_pageout_pages_needed = 0; - } - /* - * wakeup processes that are waiting on memory if we hit a - * high water mark. And wakeup scheduler process if we have - * lots of memory. this process will swapin processes. - */ - if (vm_pages_needed && - ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) { - wakeup(&cnt.v_free_count); - vm_pages_needed = 0; - } -} - -/* - * vm_page_free: - * - * Returns the given page to the free list, - * disassociating it with any VM object. - * - * Object and page must be locked prior to entry. - * This routine may not block. - */ -void -vm_page_free(m) - register vm_page_t m; -{ - int s; - struct vpgqueues *pq; - - s = splvm(); - - cnt.v_tfree++; - - if (!vm_page_freechk_and_unqueue(m)) { - splx(s); - return; - } - - m->queue = PQ_FREE + m->pc; + m->queue = queue + m->pc; pq = &vm_page_queues[m->queue]; ++(*pq->lcnt); ++(*pq->cnt); - /* - * If the pageout process is grabbing the page, it is likely - * that the page is NOT in the cache. It is more likely that - * the page will be partially in the cache if it is being - * explicitly freed. - */ - if (curproc == pageproc) { - TAILQ_INSERT_TAIL(pq->pl, m, pageq); - } else { - TAILQ_INSERT_HEAD(pq->pl, m, pageq); - } - vm_page_free_wakeup(); - splx(s); -} - -void -vm_page_free_zero(m) - register vm_page_t m; -{ - int s; - struct vpgqueues *pq; - - s = splvm(); - - cnt.v_tfree++; + if (queue == PQ_ZERO) { + TAILQ_INSERT_HEAD(pq->pl, m, pageq); + ++vm_page_zero_count; + } else { + /* + * If the pageout process is grabbing the page, it is likely + * that the page is NOT in the cache. It is more likely that + * the page will be partially in the cache if it is being + * explicitly freed. 
+ */ - if (!vm_page_freechk_and_unqueue(m)) { - splx(s); - return; + if (curproc == pageproc) { + TAILQ_INSERT_TAIL(pq->pl, m, pageq); + } else { + TAILQ_INSERT_HEAD(pq->pl, m, pageq); + } } - m->queue = PQ_ZERO + m->pc; - pq = &vm_page_queues[m->queue]; - ++(*pq->lcnt); - ++(*pq->cnt); - - TAILQ_INSERT_HEAD(pq->pl, m, pageq); - ++vm_page_zero_count; vm_page_free_wakeup(); + splx(s); } @@ -1311,6 +1438,17 @@ vm_page_wire(m) * Release one wiring of this page, potentially * enabling it to be paged again. * + * Many pages placed on the inactive queue should actually go + * into the cache, but it is difficult to figure out which. What + * we do instead, if the inactive target is well met, is to put + * clean pages at the head of the inactive queue instead of the tail. + * This will cause them to be moved to the cache more quickly and + * if not actively re-referenced, freed more quickly. If we just + * stick these pages at the end of the inactive queue, heavy filesystem + * meta-data accesses can cause an unnecessary paging load on memory bound + * processes. This optimization causes one-time-use metadata to be + * reused more quickly. + * * The page queues must be locked. * This routine may not block. */ @@ -1351,7 +1489,8 @@ vm_page_unwire(m, activate) /* - * Move the specified page to the inactive queue. + * Move the specified page to the inactive queue. If the page has + * any associated swap, the swap is deallocated. * * This routine may not block. */ @@ -1383,7 +1522,8 @@ vm_page_deactivate(m) /* * vm_page_cache * - * Put the specified page onto the page cache queue (if appropriate). + * Put the specified page onto the page cache queue (if appropriate). + * * This routine may not block. */ void @@ -1624,7 +1764,7 @@ again1: } next = TAILQ_NEXT(m, pageq); - if (vm_page_sleep(m, "vpctw0", &m->busy)) + if (vm_page_sleep_busy(m, TRUE, "vpctw0")) goto again1; vm_page_test_dirty(m); if (m->dirty) { @@ -1652,7 +1792,7 @@ again1: } next = TAILQ_NEXT(m, pageq); - if (vm_page_sleep(m, "vpctw1", &m->busy)) + if (vm_page_sleep_busy(m, TRUE, "vpctw1")) goto again1; vm_page_test_dirty(m); if (m->dirty) { diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 3149391..f9e4926 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_page.h,v 1.48 1998/10/28 13:37:02 dg Exp $ + * $Id: vm_page.h,v 1.49 1999/01/08 17:31:28 eivind Exp $ */ /* @@ -105,10 +105,10 @@ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ - TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */ - TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ + struct vm_page *hnext; /* hash table link (O,P) */ + TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ - vm_object_t object; /* which object am I in (O,P) */ + vm_object_t object; /* which object am I in (O,P)*/ vm_pindex_t pindex; /* offset into object (O,P) */ vm_offset_t phys_addr; /* physical address of page */ u_short queue; /* page queue index */ @@ -130,6 +130,13 @@ struct vm_page { }; /* + * note SWAPBLK_NONE is a flag, basically the high bit. 
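
With a 32-bit signed daddr_t, the two SWAPBLK constants work out as follows (worked example; wider daddr_t types scale the same way):

	/*
	 * (u_daddr_t)-1 >> 1   == 0x7fffffff  -> SWAPBLK_MASK (payload bits)
	 * SWAPBLK_MASK + 1     == 0x80000000  -> SWAPBLK_NONE (the high bit)
	 *
	 * A real swap block number n always fits in (n & SWAPBLK_MASK);
	 * a slot with no swap assigned is marked with the SWAPBLK_NONE
	 * bit rather than with a magic in-range value.
	 */
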
+ */ + +#define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */ +#define SWAPBLK_NONE ((daddr_t)((u_daddr_t)SWAPBLK_MASK + 1))/* flag */ + +/* * Page coloring parameters */ /* Each of PQ_FREE, PQ_ZERO and PQ_CACHE have PQ_HASH_SIZE entries */ @@ -201,14 +208,15 @@ extern struct vpgqueues { * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. */ -#define PG_BUSY 0x01 /* page is in transit (O) */ -#define PG_WANTED 0x02 /* someone is waiting for page (O) */ -#define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */ -#define PG_WRITEABLE 0x10 /* page is mapped writeable */ -#define PG_MAPPED 0x20 /* page is mapped */ -#define PG_ZERO 0x40 /* page is zeroed */ -#define PG_REFERENCED 0x80 /* page has been referenced */ -#define PG_CLEANCHK 0x100 /* page will be checked for cleaning */ +#define PG_BUSY 0x0001 /* page is in transit (O) */ +#define PG_WANTED 0x0002 /* someone is waiting for page (O) */ +#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ +#define PG_WRITEABLE 0x0010 /* page is mapped writeable */ +#define PG_MAPPED 0x0020 /* page is mapped */ +#define PG_ZERO 0x0040 /* page is zeroed */ +#define PG_REFERENCED 0x0080 /* page has been referenced */ +#define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */ +#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ /* * Misc constants. @@ -307,16 +315,36 @@ vm_page_busy(vm_page_t m) vm_page_flag_set(m, PG_BUSY); } +/* + * vm_page_flash: + * + * wakeup anyone waiting for the page. + */ + static __inline void -vm_page_wakeup(vm_page_t m) +vm_page_flash(vm_page_t m) { - vm_page_flag_clear(m, PG_BUSY); if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } } +/* + * vm_page_wakeup: + * + * clear the PG_BUSY flag and wakeup anyone waiting for the + * page. 
+ * + */ + +static __inline void +vm_page_wakeup(vm_page_t m) +{ + vm_page_flag_clear(m, PG_BUSY); + vm_page_flash(m); +} + static __inline void vm_page_io_start(vm_page_t m) { @@ -327,10 +355,8 @@ static __inline void vm_page_io_finish(vm_page_t m) { atomic_subtract_char(&m->busy, 1); - if ((m->flags & PG_WANTED) && m->busy == 0) { - vm_page_flag_clear(m, PG_WANTED); - wakeup(m); - } + if (m->busy == 0) + vm_page_flash(m); } @@ -353,12 +379,13 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); +static __inline void vm_page_free __P((vm_page_t)); +static __inline void vm_page_free_zero __P((vm_page_t)); +void vm_page_destroy __P((vm_page_t)); void vm_page_deactivate __P((vm_page_t)); -void vm_page_free __P((vm_page_t)); -void vm_page_free_zero __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t)); -void vm_page_remove __P((vm_page_t)); +vm_object_t vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t, int)); @@ -374,7 +401,11 @@ int vm_page_bits __P((int, int)); vm_page_t vm_page_list_find __P((int, int)); int vm_page_queue_index __P((vm_offset_t, int)); vm_page_t vm_page_select __P((vm_object_t, vm_pindex_t, int)); +#if 0 int vm_page_sleep(vm_page_t m, char *msg, char *busy); +int vm_page_asleep(vm_page_t m, char *msg, char *busy); +#endif +void vm_page_free_toq(vm_page_t m, int queue); /* * Keep page from being freed by the page daemon @@ -438,5 +469,64 @@ vm_page_copy(src_m, dest_m) dest_m->valid = VM_PAGE_BITS_ALL; } +/* + * vm_page_free: + * + * Free a page + */ +static __inline void +vm_page_free(m) + vm_page_t m; +{ + vm_page_free_toq(m, PQ_FREE); +} + +/* + * vm_page_free_zero: + * + * Free a page to the zerod-pages queue + */ +static __inline void +vm_page_free_zero(m) + vm_page_t m; +{ + vm_page_free_toq(m, PQ_ZERO); +} + +/* + * vm_page_sleep_busy: + * + * Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE) + * m->busy is zero. Returns TRUE if it had to sleep ( including if + * it almost had to sleep and made temporary spl*() mods), FALSE + * otherwise. + * + * This routine assumes that interrupts can only remove the busy + * status from a page, not set the busy status or change it from + * PG_BUSY to m->busy or vise versa (which would create a timing + * window). + * + * Note that being an inline, this code will be well optimized. + */ + +static __inline int +vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg) +{ + if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) { + int s = splvm(); + if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) { + /* + * Page is busy. Wait and retry. + */ + vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); + tsleep(m, PVM, msg, 0); + } + splx(s); + return(TRUE); + /* not reached */ + } + return(FALSE); +} + #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 606981f..06f24d6 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -65,7 +65,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_pageout.c,v 1.128 1998/10/25 17:44:59 phk Exp $ + * $Id: vm_pageout.c,v 1.129 1998/10/31 17:21:31 peter Exp $ */ /* @@ -211,13 +211,10 @@ void pmap_collect(void); * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to - * block. - * - * And we set pageout-in-progress to keep the object from disappearing - * during pageout. This guarantees that the page won't move from the - * inactive queue. (However, any other page on the inactive queue may - * move!) + * block. Note the careful timing, however, the busy bit isn't set till + * late and we cannot do anything that will mess with the page. */ + static int vm_pageout_clean(m) vm_page_t m; @@ -231,12 +228,23 @@ vm_pageout_clean(m) object = m->object; /* + * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP + * with the new swapper, but we could have serious problems paging + * out other object types if there is insufficient memory. + * + * Unfortunately, checking free memory here is far too late, so the + * check has been moved up a procedural level. + */ + +#if 0 + /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((object->type == OBJT_DEFAULT) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; +#endif /* * Don't mess with the page if it's busy. @@ -245,12 +253,21 @@ vm_pageout_clean(m) ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; +#if 0 + /* + * XXX REMOVED XXX. vm_object_collapse() can block, which can + * change the page state. Calling vm_object_collapse() might also + * destroy or rename the page because we have not busied it yet!!! + * So this code segment is removed. + */ /* - * Try collapsing before it's too late. + * Try collapsing before it's too late. XXX huh? Why are we doing + * this here? */ if (object->backing_object) { vm_object_collapse(object); } +#endif mc[vm_pageout_page_count] = m; pageout_count = 1; @@ -351,6 +368,16 @@ do_backward: return vm_pageout_flush(&mc[page_base], pageout_count, 0); } +/* + * vm_pageout_flush() - launder the given pages + * + * The given pages are laundered. Note that we setup for the start of + * I/O ( i.e. busy the page ), mark it read-only, and bump the object + * reference count all in here rather then in the parent. If we want + * the parent to do more sophisticated things we may have to change + * the ordering. + */ + int vm_pageout_flush(mc, count, flags) vm_page_t *mc; @@ -362,6 +389,14 @@ vm_pageout_flush(mc, count, flags) int numpagedout = 0; int i; + /* + * Initiate I/O. Bump the vm_page_t->busy counter and + * mark the pages read-only. + * + * We do not have to fixup the clean/dirty bits here... we can + * allow the pager to do it after the I/O completes. + */ + for (i = 0; i < count; i++) { vm_page_io_start(mc[i]); vm_page_protect(mc[i], VM_PROT_READ); @@ -585,25 +620,24 @@ vm_pageout_map_deactivate_pages(map, desired) } #endif +/* + * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore + * to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects + * which we know can be trivially freed. 
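
vm_pageout_flush() above now owns the I/O setup itself: each page is soft-busied and write-protected before the run is handed to the pager, and the matching teardown happens from the pager's completion path. The bracket, as a sketch:

	/* before pgo_putpages, per page: */
	vm_page_io_start(m);			/* ++m->busy: soft-busy for I/O */
	vm_page_protect(m, VM_PROT_READ);	/* catch writes during pageout */

	/* from the pager's iodone path, per page: */
	vm_page_io_finish(m);			/* --m->busy; flashes waiters at 0 */
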
+ */ + void vm_pageout_page_free(vm_page_t m) { - struct vnode *vp; - vm_object_t object; - - object = m->object; - object->ref_count++; - - if (object->type == OBJT_VNODE) { - vp = object->handle; - vp->v_usecount++; - if (VSHOULDBUSY(vp)) - vbusy(vp); - } + vm_object_t object = m->object; + int type = object->type; + if (type == OBJT_SWAP || type == OBJT_DEFAULT) + vm_object_reference(object); vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); - vm_object_deallocate(object); + if (type == OBJT_SWAP || type == OBJT_DEFAULT) + vm_object_deallocate(object); } /* @@ -613,9 +647,10 @@ static int vm_pageout_scan() { vm_page_t m, next; - int page_shortage, addl_page_shortage, maxscan, pcount; + int page_shortage, maxscan, pcount; + int addl_page_shortage, addl_page_shortage_init; int maxlaunder; - int pages_freed; + int launder_loop = 0; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; @@ -629,31 +664,53 @@ vm_pageout_scan() */ pmap_collect(); - /* - * Start scanning the inactive queue for pages we can free. We keep - * scanning until we have enough free pages or we have scanned through - * the entire queue. If we encounter dirty pages, we start cleaning - * them. - */ - - pages_freed = 0; - addl_page_shortage = vm_pageout_deficit; + addl_page_shortage_init = vm_pageout_deficit; vm_pageout_deficit = 0; if (max_page_launder == 0) max_page_launder = 1; - maxlaunder = (cnt.v_inactive_target > max_page_launder) ? - max_page_launder : cnt.v_inactive_target; -rescan0: - maxscan = cnt.v_inactive_count; - for( m = TAILQ_FIRST(&vm_page_queue_inactive); + /* + * Calculate the number of pages we want to either free or move + * to the cache. + */ + + page_shortage = (cnt.v_free_target + cnt.v_cache_min) - + (cnt.v_free_count + cnt.v_cache_count); + page_shortage += addl_page_shortage_init; + + /* + * Figure out what to do with dirty pages when they are encountered. + * Assume that 1/3 of the pages on the inactive list are clean. If + * we think we can reach our target, disable laundering (do not + * clean any dirty pages). If we miss the target we will loop back + * up and do a laundering run. + */ - (m != NULL) && (maxscan-- > 0) && - ((cnt.v_cache_count + cnt.v_free_count) < - (cnt.v_cache_min + cnt.v_free_target)); + if (cnt.v_inactive_count / 3 > page_shortage) { + maxlaunder = 0; + launder_loop = 0; + } else { + maxlaunder = + (cnt.v_inactive_target > max_page_launder) ? + max_page_launder : cnt.v_inactive_target; + launder_loop = 1; + } - m = next) { + /* + * Start scanning the inactive queue for pages we can move to the + * cache or free. The scan will stop when the target is reached or + * we have scanned the entire inactive queue. + */ + +rescan0: + addl_page_shortage = addl_page_shortage_init; + maxscan = cnt.v_inactive_count; + for ( + m = TAILQ_FIRST(&vm_page_queue_inactive); + m != NULL && maxscan-- > 0 && page_shortage > 0; + m = next + ) { cnt.v_pdpages++; @@ -681,19 +738,21 @@ rescan0: } /* - * If the object is not being used, we ignore previous references. + * If the object is not being used, we ignore previous + * references. */ if (m->object->ref_count == 0) { vm_page_flag_clear(m, PG_REFERENCED); pmap_clear_reference(VM_PAGE_TO_PHYS(m)); /* - * Otherwise, if the page has been referenced while in the inactive - * queue, we bump the "activation count" upwards, making it less - * likely that the page will be added back to the inactive queue - * prematurely again. 
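
The clean-fraction heuristic above is easiest to see with numbers (hypothetical figures):

	/*
	 * v_free_target = 1024   v_cache_min   = 512
	 * v_free_count  =  256   v_cache_count = 128
	 *
	 *	page_shortage = (1024 + 512) - (256 + 128) = 1152
	 *
	 * v_inactive_count = 6000: estimated clean = 6000/3 = 2000 > 1152,
	 * so maxlaunder = 0 and the first pass moves clean pages only.
	 *
	 * v_inactive_count = 3000: estimated clean = 1000 <= 1152, so
	 * laundering is enabled on the first pass (launder_loop = 1).
	 */
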
Here we check the page tables (or emulated - * bits, if any), given the upper level VM system not knowing anything - * about existing references. + * Otherwise, if the page has been referenced while in the + * inactive queue, we bump the "activation count" upwards, + * making it less likely that the page will be added back to + * the inactive queue prematurely again. Here we check the + * page tables (or emulated bits, if any), given the upper + * level VM system not knowing anything about existing + * references. */ } else if (((m->flags & PG_REFERENCED) == 0) && (actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)))) { @@ -703,10 +762,10 @@ rescan0: } /* - * If the upper level VM system knows about any page references, - * we activate the page. We also set the "activation count" higher - * than normal so that we will less likely place pages back onto the - * inactive queue again. + * If the upper level VM system knows about any page + * references, we activate the page. We also set the + * "activation count" higher than normal so that we will less + * likely place pages back onto the inactive queue again. */ if ((m->flags & PG_REFERENCED) != 0) { vm_page_flag_clear(m, PG_REFERENCED); @@ -717,9 +776,10 @@ rescan0: } /* - * If the upper level VM system doesn't know anything about the - * page being dirty, we have to check for it again. As far as the - * VM code knows, any partially dirty pages are fully dirty. + * If the upper level VM system doesn't know anything about + * the page being dirty, we have to check for it again. As + * far as the VM code knows, any partially dirty pages are + * fully dirty. */ if (m->dirty == 0) { vm_page_test_dirty(m); @@ -733,14 +793,14 @@ rescan0: if (m->valid == 0) { vm_pageout_page_free(m); cnt.v_dfree++; - pages_freed++; + --page_shortage; /* * Clean pages can be placed onto the cache queue. */ } else if (m->dirty == 0) { vm_page_cache(m); - pages_freed++; + --page_shortage; /* * Dirty pages need to be paged out. Note that we clean @@ -763,8 +823,8 @@ rescan0: } /* - * We don't bother paging objects that are "dead". Those - * objects are in a "rundown" state. + * We don't bother paging objects that are "dead". + * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { s = splvm(); @@ -774,10 +834,61 @@ rescan0: continue; } - if ((object->type == OBJT_VNODE) && - (object->flags & OBJ_DEAD) == 0) { + /* + * For now we protect against potential memory + * deadlocks by requiring significant memory to be + * free if the object is not OBJT_DEFAULT or OBJT_SWAP. + * We do not 'trust' any other object type to operate + * with low memory, not even OBJT_DEVICE. The VM + * allocator will special case allocations done by + * the pageout daemon so the check below actually + * does have some hysteresis in it. It isn't the best + * solution, though. + */ + + if ( + object->type != OBJT_DEFAULT && + object->type != OBJT_SWAP && + cnt.v_free_count < cnt.v_free_reserved + ) { + s = splvm(); + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + splx(s); + continue; + } + + /* + * Presumably we have sufficient free memory to do + * the more sophisticated checks and locking required + * for vnodes. + * + * The object is already known NOT to be dead. The + * vget() may still block, though, because + * VOP_ISLOCKED() doesn't check to see if an inode + * (v_data) is associated with the vnode. If it isn't, + * vget() will load in it from disk. 
Worse, vget() + * may actually get stuck waiting on "inode" if another + * process is in the process of bringing the inode in. + * This is bad news for us either way. + * + * So for the moment we check v_data == NULL as a + * workaround. This means that vnodes which do not + * use v_data in the way we expect probably will not + * wind up being paged out by the pager and it will be + * up to the syncer to get them. That's better then + * us blocking here. + * + * This whole code section is bogus - we need to fix + * the vnode pager to handle vm_page_t's without us + * having to do any sophisticated VOP tests. + */ + + if (object->type == OBJT_VNODE) { vp = object->handle; + if (VOP_ISLOCKED(vp) || + vp->v_data == NULL || vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { if ((m->queue == PQ_INACTIVE) && (m->hold_count == 0) && @@ -844,19 +955,34 @@ rescan0: } /* - * Compute the page shortage. If we are still very low on memory be - * sure that we will move a minimal amount of pages from active to - * inactive. + * If we still have a page shortage and we didn't launder anything, + * run the inactive scan again and launder something this time. + */ + + if (launder_loop == 0 && page_shortage > 0) { + launder_loop = 1; + maxlaunder = + (cnt.v_inactive_target > max_page_launder) ? + max_page_launder : cnt.v_inactive_target; + goto rescan0; + } + + /* + * Compute the page shortage from the point of view of having to + * move pages from the active queue to the inactive queue. */ + page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); page_shortage += addl_page_shortage; - if (page_shortage <= 0) { - page_shortage = 0; - } + + /* + * Scan the active queue for things we can deactivate + */ pcount = cnt.v_active_count; m = TAILQ_FIRST(&vm_page_queue_active); + while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { /* @@ -943,10 +1069,14 @@ rescan0: } s = splvm(); + /* * We try to maintain some *really* free pages, this allows interrupt - * code to be guaranteed space. + * code to be guaranteed space. Since both cache and free queues + * are considered basically 'free', moving pages from cache to free + * does not effect other calculations. */ + while (cnt.v_free_count < cnt.v_free_reserved) { static int cache_rover = 0; m = vm_page_list_find(PQ_CACHE, cache_rover); @@ -995,7 +1125,6 @@ rescan0: #endif } - /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. @@ -1242,10 +1371,8 @@ vm_pageout() cnt.v_pdwakeups++; vm_pages_needed = 0; splx(s); - vm_pager_sync(); vm_pageout_scan(); vm_pageout_deficit = 0; - vm_pager_sync(); wakeup(&cnt.v_free_count); } } diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index a864896..68c0561 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_pageout.h,v 1.22 1998/01/12 01:44:46 dyson Exp $ + * $Id: vm_pageout.h,v 1.23 1998/01/22 17:30:43 dyson Exp $ */ #ifndef _VM_VM_PAGEOUT_H_ @@ -100,7 +100,9 @@ extern int vm_pageout_deficit; extern void pagedaemon_wakeup __P((void)); #define VM_WAIT vm_wait() +#define VM_AWAIT vm_await() extern void vm_wait __P((void)); +extern void vm_await __P((void)); #ifdef KERNEL void vm_pageout_page __P((vm_page_t, vm_object_t)); diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 18df05d..62fe6e8 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pager.c,v 1.39 1998/10/31 15:31:29 peter Exp $ + * $Id: vm_pager.c,v 1.40 1998/11/10 09:16:27 peter Exp $ */ /* @@ -91,6 +91,8 @@ extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; +int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ + static int dead_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static vm_object_t dead_pager_alloc __P((void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); @@ -164,14 +166,15 @@ struct pagerops deadpagerops = { NULL }; -static struct pagerops *pagertab[] = { +struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &deadpagerops /* OBJT_DEAD */ }; -static int npagers = sizeof(pagertab) / sizeof(pagertab[0]); + +int npagers = sizeof(pagertab) / sizeof(pagertab[0]); /* * Kernel address space for mapping pages. @@ -217,6 +220,8 @@ vm_pager_bufferinit() bp->b_xflags = 0; } + cluster_pbuf_freecnt = nswbuf / 2; + swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS); if (!swapbkva) panic("Not enough pager_map VM space for physical buffers"); @@ -246,41 +251,21 @@ vm_pager_deallocate(object) (*pagertab[object->type]->pgo_dealloc) (object); } +/* + * vm_pager_get_pages() - inline, see vm/vm_pager.h + * vm_pager_put_pages() - inline, see vm/vm_pager.h + * vm_pager_has_page() - inline, see vm/vm_pager.h + * vm_pager_page_inserted() - inline, see vm/vm_pager.h + * vm_pager_page_removed() - inline, see vm/vm_pager.h + */ -int -vm_pager_get_pages(object, m, count, reqpage) - vm_object_t object; - vm_page_t *m; - int count; - int reqpage; -{ - return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage)); -} - -int -vm_pager_put_pages(object, m, count, flags, rtvals) - vm_object_t object; - vm_page_t *m; - int count; - int flags; - int *rtvals; -{ - return ((*pagertab[object->type]->pgo_putpages)(object, m, count, flags, rtvals)); -} - -boolean_t -vm_pager_has_page(object, offset, before, after) - vm_object_t object; - vm_pindex_t offset; - int *before; - int *after; -{ - return ((*pagertab[object->type]->pgo_haspage) (object, offset, before, after)); -} - +#if 0 /* - * Called by pageout daemon before going back to sleep. - * Gives pagers a chance to clean up any completed async pageing operations. + * vm_pager_sync: + * + * Called by pageout daemon before going back to sleep. + * Gives pagers a chance to clean up any completed async pageing + * operations. */ void vm_pager_sync() @@ -292,6 +277,8 @@ vm_pager_sync() (*(*pgops)->pgo_sync) (); } +#endif + vm_offset_t vm_pager_map_page(m) vm_page_t m; @@ -342,20 +329,42 @@ initpbuf(struct buf *bp) { /* * allocate a physical buffer + * + * There are a limited number (nswbuf) of physical buffers. 
We need + * to make sure that no single subsystem is able to hog all of them, + * so each subsystem implements a counter which is typically initialized + * to 1/2 nswbuf. getpbuf() decrements this counter on allocation and + * increments it on release, and blocks if the counter hits zero. A + * subsystem may initialize the counter to -1 to disable the feature, + * but it must still be sure to match up all uses of getpbuf() with + * relpbuf() using the same variable. + * + * NOTE: pfreecnt can be NULL, but this 'feature' will be removed + * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * -getpbuf() +getpbuf(pfreecnt) + int *pfreecnt; { int s; struct buf *bp; s = splvm(); + + if (pfreecnt) { + while (*pfreecnt == 0) { + tsleep(pfreecnt, PVM, "wswbuf0", 0); + } + } + /* get a bp from the swap buffer header pool */ while ((bp = TAILQ_FIRST(&bswlist)) == NULL) { bswneeded = 1; - tsleep(&bswneeded, PVM, "wswbuf", 0); + tsleep(&bswneeded, PVM, "wswbuf1", 0); } TAILQ_REMOVE(&bswlist, bp, b_freelist); + if (pfreecnt) + --*pfreecnt; splx(s); initpbuf(bp); @@ -363,20 +372,27 @@ } /* - * allocate a physical buffer, if one is available + * allocate a physical buffer, if one is available. + * + * Note that there is no NULL hack here - all subsystems using this + * call understand how to use pfreecnt. */ struct buf * -trypbuf() +trypbuf(pfreecnt) + int *pfreecnt; { int s; struct buf *bp; s = splvm(); - if ((bp = TAILQ_FIRST(&bswlist)) == NULL) { + if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { splx(s); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); + + --*pfreecnt; + splx(s); initpbuf(bp); @@ -386,10 +402,14 @@ } /* * release a physical buffer + * + * NOTE: pfreecnt can be NULL, but this 'feature' will be removed + * relatively soon when the rest of the subsystems get smart about it. XXX */ void -relpbuf(bp) +relpbuf(bp, pfreecnt) struct buf *bp; + int *pfreecnt; { int s; @@ -403,6 +423,7 @@ relpbuf(bp) crfree(bp->b_wcred); bp->b_wcred = NOCRED; } + if (bp->b_vp) pbrelvp(bp); @@ -415,5 +436,9 @@ bswneeded = 0; wakeup(&bswneeded); } + if (pfreecnt) { + if (++*pfreecnt == 1) + wakeup(pfreecnt); + } splx(s); }
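With these changes getpbuf()/relpbuf() behave as a counting semaphore per subsystem. The expected consumer pattern looks like the sketch below; my_pbuf_freecnt and my_pbuf_io are hypothetical names, not part of this patch, and the initialization mirrors what vm_pager_bufferinit() does for cluster_pbuf_freecnt:

static int my_pbuf_freecnt = -1;	/* -1 = limit not yet initialized */

static void
my_pbuf_io(void)
{
	struct buf *bp;

	if (my_pbuf_freecnt < 0)
		my_pbuf_freecnt = nswbuf / 2;	/* claim at most half the pbufs */

	bp = getpbuf(&my_pbuf_freecnt);	/* may sleep in "wswbuf0"/"wswbuf1" */
	/* ... fill in the buffer header and perform the I/O ... */
	relpbuf(bp, &my_pbuf_freecnt);	/* 0->1 transition wakes a waiter */
}

Every getpbuf(), trypbuf() and relpbuf() issued by a given subsystem must pass the same counter, otherwise the accounting, and the wakeups keyed on it, break.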
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h index 6b8eb42..0e8d894 --- a/sys/vm/vm_pager.h +++ b/sys/vm/vm_pager.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 - * $Id: vm_pager.h,v 1.16 1998/03/07 21:37:27 dyson Exp $ + * $Id: vm_pager.h,v 1.17 1998/10/13 08:24:44 dg Exp $ */ /* @@ -57,7 +57,7 @@ struct pagerops { int (*pgo_getpages) __P((vm_object_t, vm_page_t *, int, int)); /* Get (read) page. */ int (*pgo_putpages) __P((vm_object_t, vm_page_t *, int, int, int *)); /* Put (write) page. */ boolean_t (*pgo_haspage) __P((vm_object_t, vm_pindex_t, int *, int *)); /* Does pager have page? */ - void (*pgo_sync) __P((void)); + void (*pgo_pageunswapped) __P((vm_page_t)); }; /* @@ -87,20 +87,69 @@ MALLOC_DECLARE(M_VMPGDATA); extern vm_map_t pager_map; extern int pager_map_size; +extern struct pagerops *pagertab[]; vm_object_t vm_pager_allocate __P((objtype_t, void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); void vm_pager_bufferinit __P((void)); void vm_pager_deallocate __P((vm_object_t)); -int vm_pager_get_pages __P((vm_object_t, vm_page_t *, int, int)); -boolean_t vm_pager_has_page __P((vm_object_t, vm_pindex_t, int *, int *)); +static __inline int vm_pager_get_pages __P((vm_object_t, vm_page_t *, int, int)); +static __inline boolean_t vm_pager_has_page __P((vm_object_t, vm_pindex_t, int *, int *)); void vm_pager_init __P((void)); vm_object_t vm_pager_object_lookup __P((struct pagerlst *, void *)); vm_offset_t vm_pager_map_pages __P((vm_page_t *, int, boolean_t)); vm_offset_t vm_pager_map_page __P((vm_page_t)); -int vm_pager_put_pages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); +static __inline int vm_pager_put_pages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); void vm_pager_sync __P((void)); void vm_pager_unmap_pages __P((vm_offset_t, int)); void vm_pager_unmap_page __P((vm_offset_t)); + +static __inline int +vm_pager_get_pages( + vm_object_t object, + vm_page_t *m, + int count, + int reqpage +) { + return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage)); +} + +static __inline int +vm_pager_put_pages( + vm_object_t object, + vm_page_t *m, + int count, + int flags, + int *rtvals +) { + return ((*pagertab[object->type]->pgo_putpages)(object, m, count, flags, rtvals)); +} + +static __inline boolean_t +vm_pager_has_page( + vm_object_t object, + vm_pindex_t offset, + int *before, + int *after +) { + return ((*pagertab[object->type]->pgo_haspage) (object, offset, before, after)); } + +/* + * vm_pager_page_unswapped + * + * called at splvm() to destroy swap associated with the page. + * + * This function may not block. + */ + +static __inline void +vm_pager_page_unswapped(vm_page_t m) +{ + if (pagertab[m->object->type]->pgo_pageunswapped) + (*pagertab[m->object->type]->pgo_pageunswapped)(m); +} + + #endif #endif /* _VM_PAGER_ */
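With the entry points inlined, a pager call is just an indirect call through pagertab[]. On return from vm_pager_put_pages() the caller examines one status code per page; the fragment below is illustrative only (clean_pages_sketch is a hypothetical name, the flags argument is left 0, and count is assumed to fit the local array):

static void
clean_pages_sketch(vm_object_t object, vm_page_t *m, int count)
{
	int i;
	int rtvals[8];	/* one VM_PAGER_* code per page; count <= 8 assumed */

	vm_pager_put_pages(object, m, count, 0, rtvals);
	for (i = 0; i < count; i++) {
		if (rtvals[i] != VM_PAGER_OK) {
			/* page i was not written; it remains dirty */
		}
	}
}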
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c index bfcebdc..f973631 --- a/sys/vm/vm_swap.c +++ b/sys/vm/vm_swap.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 - * $Id: vm_swap.c,v 1.56 1998/07/04 22:30:26 julian Exp $ + * $Id: vm_swap.c,v 1.57 1998/10/25 19:24:04 bde Exp $ */ #include "opt_devfs.h" @@ -50,7 +50,7 @@ #include <sys/dmap.h> /* XXX */ #include <sys/vnode.h> #include <sys/fcntl.h> -#include <sys/rlist.h> +#include <sys/blist.h> #include <sys/kernel.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -94,8 +94,7 @@ static dev_t swapdev = makedev(BDEV_MAJOR, 0); static struct swdevt should_be_malloced[NSWAPDEV]; static struct swdevt *swdevt = should_be_malloced; struct vnode *swapdev_vp; -/* XXX swapinfo(8) needs this one I belive */ -int nswap; /* first block after the interleaved devs */ +static int nswap; /* first block after the interleaved devs */ static int nswdev = NSWAPDEV; int vm_swap_size; @@ -119,7 +118,13 @@ swstrategy(bp) register struct swdevt *sp; struct vnode *vp; - sz = howmany(bp->b_bcount, DEV_BSIZE); + sz = howmany(bp->b_bcount, PAGE_SIZE); + /* + * Convert interleaved swap into per-device swap. Note that + * the block size is left in PAGE_SIZE'd chunks (for the newswap) + * here. + */ + if (nswdev > 1) { off = bp->b_blkno % dmmax; if (off + sz > dmmax) { @@ -132,8 +137,9 @@ index = seg % nswdev; seg /= nswdev; bp->b_blkno = seg * dmmax + off; - } else + } else { index = 0; + } sp = &swdevt[index]; if (bp->b_blkno + sz > sp->sw_nblks) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; biodone(bp); return; } + + /* + * Convert from PAGE_SIZE'd to DEV_BSIZE'd chunks for the actual I/O + */ + bp->b_blkno = ctodb(bp->b_blkno); + vhold(sp->sw_vp); s = splvm(); if ((bp->b_flags & B_READ) == 0) { @@ -161,10 +173,8 @@ } sp->sw_vp->v_numoutput++; } - if (bp->b_vp != NULL) - pbrelvp(bp); + pbreassignbuf(bp, sp->sw_vp); splx(s); - bp->b_vp = sp->sw_vp; VOP_STRATEGY(bp->b_vp, bp); } @@ -240,6 +250,11 @@ swapon(p, uap) * Each of the nswdev devices provides 1/nswdev'th of the swap * space, which is laid out with blocks of dmmax pages circularly * among the devices. + * + * The new swap code uses page-sized blocks. The old swap code used + * DEV_BSIZE'd chunks. + * + * XXX locking when multiple swapons run in parallel */ int swaponvp(p, vp, dev, nblks) @@ -277,18 +292,37 @@ (void) VOP_CLOSE(vp, FREAD | FWRITE, p->p_ucred, p); return (ENXIO); } + /* + * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. + * First chop nblks off to page-align it, then convert. + * + * sw->sw_nblks is in page-sized chunks now too. + */ + nblks &= ~(ctodb(1) - 1); + nblks = dbtoc(nblks); + sp->sw_vp = vp; sp->sw_dev = dev; sp->sw_flags |= SW_FREED; sp->sw_nblks = nblks; + /* + * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not + * DEV_BSIZE'd. + */ + if (nblks * nswdev > nswap) nswap = (nblks+1) * nswdev; + if (swapblist == NULL) + swapblist = blist_create(nswap); + else + blist_resize(&swapblist, nswap, 0); + for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) { blk = min(nblks - dvbase, dmmax); vsbase = index * dmmax + dvbase * nswdev; - rlist_free(&swaplist, vsbase, vsbase + blk - 1); + blist_free(swapblist, vsbase, blk); vm_swap_size += blk; }
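The conversions above keep swap block numbers in PAGE_SIZE'd units everywhere inside the VM system and switch to DEV_BSIZE'd units only at the driver boundary. Assuming PAGE_SIZE is 4096 and DEV_BSIZE is 512 (so ctodb(1) == 8), the arithmetic works out as follows; the numbers are illustrative only:

	nblks = 20001;			/* device size in DEV_BSIZE blocks */
	nblks &= ~(ctodb(1) - 1);	/* 20001 & ~7 == 20000: page-aligned */
	nblks = dbtoc(nblks);		/* 20000 / 8 == 2500 PAGE_SIZE'd blocks */

	/* at I/O time, swstrategy() converts back for VOP_STRATEGY(): */
	bp->b_blkno = ctodb(bp->b_blkno);	/* page block 7 -> device block 56 */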
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index fba7e2f..fe04da4 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -38,7 +38,7 @@ * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.100 1998/10/13 08:24:44 dg Exp $ + * $Id: vnode_pager.c,v 1.101 1998/12/04 18:39:44 rvb Exp $ */ /* @@ -88,6 +88,8 @@ struct pagerops vnodepagerops = { NULL }; +int vnode_pbuf_freecnt = -1; /* start out unlimited */ + /* * Allocate (or lookup) pager for a vnode. @@ -106,6 +108,13 @@ vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, if (handle == NULL) return (NULL); + /* + * XXX hack - This initialization should be put somewhere else. + */ + if (vnode_pbuf_freecnt < 0) { + vnode_pbuf_freecnt = nswbuf / 2 + 1; + } + vp = (struct vnode *) handle; /* @@ -395,7 +404,7 @@ vnode_pager_input_smlfs(object, m) fileaddr = vnode_pager_addr(vp, IDX_TO_OFF(m->pindex) + i * bsize, (int *)0); if (fileaddr != -1) { - bp = getpbuf(); + bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; @@ -428,7 +437,7 @@ /* * free the buffer header back to the swap buffer pool */ - relpbuf(bp); + relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; @@ -707,7 +716,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - bp = getpbuf(); + bp = getpbuf(&vnode_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; /* @@ -755,7 +764,7 @@ /* * free the buffer header back to the swap buffer pool */ - relpbuf(bp); + relpbuf(bp, &vnode_pbuf_freecnt); for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) { vm_page_t mt;