63 files changed, 3198 insertions, 2062 deletions
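A conversion that recurs throughout this commit is from open-coded PG_BUSY/PG_WANTED manipulation to a small set of busy-page helpers. The sketch below reconstructs those helpers from the call sites in the hunks that follow; it is an editorial aid, not part of the patch, and the authoritative definitions are the inlines in vm/vm_page.h.

    static __inline void
    vm_page_busy(vm_page_t m)
    {
            /* replaces: m->flags |= PG_BUSY; */
            vm_page_flag_set(m, PG_BUSY);
    }

    static __inline void
    vm_page_flash(vm_page_t m)
    {
            /* wake PG_WANTED sleepers without clearing PG_BUSY */
            if (m->flags & PG_WANTED) {
                    vm_page_flag_clear(m, PG_WANTED);
                    wakeup(m);
            }
    }

    static __inline void
    vm_page_wakeup(vm_page_t m)
    {
            /* replaces: m->flags &= ~PG_BUSY; plus the PG_WANTED dance */
            vm_page_flag_clear(m, PG_BUSY);
            vm_page_flash(m);
    }

    static __inline int
    vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *wmesg)
    {
            /*
             * Sleep while the page is PG_BUSY (and, if also_m_busy,
             * while m->busy is nonzero).  Returns TRUE if it slept,
             * so callers loop and re-test, e.g.:
             *
             *      while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
             *              ;
             */
            if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
                    int s = splvm();

                    if ((m->flags & PG_BUSY) ||
                        (also_m_busy && m->busy)) {
                            vm_page_flag_set(m, PG_WANTED);
                            tsleep(m, PVM, wmesg, 0);
                    }
                    splx(s);
                    return (TRUE);
            }
            return (FALSE);
    }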
diff --git a/sys/alpha/alpha/pmap.c b/sys/alpha/alpha/pmap.c index 0e7aa73..fe8741d 100644 --- a/sys/alpha/alpha/pmap.c +++ b/sys/alpha/alpha/pmap.c @@ -43,7 +43,7 @@ * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap - * $Id: pmap.c,v 1.11 1998/10/21 11:38:06 dg Exp $ + * $Id: pmap.c,v 1.12 1998/10/28 13:36:49 dg Exp $ */ /* @@ -950,7 +950,7 @@ pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) vm_page_t m; retry: m = vm_page_lookup(object, pindex); - if (m && vm_page_sleep(m, "pplookp", NULL)) + if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } @@ -1039,7 +1039,7 @@ pmap_dispose_proc(p) if ((m = vm_page_lookup(upobj, i)) == NULL) panic("pmap_dispose_proc: upage already missing???"); - vm_page_flag_set(m, PG_BUSY); + vm_page_busy(m); oldpte = *(ptek + i); *(ptek + i) = 0; @@ -1128,7 +1128,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { int s; - while (vm_page_sleep(m, "pmuwpt", NULL)); + while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) + ; if (m->hold_count == 0) { vm_offset_t pteva; @@ -1181,7 +1182,7 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) wakeup(m); } - vm_page_flag_set(m, PG_BUSY); + vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } @@ -1316,10 +1317,10 @@ pmap_release_free_page(pmap_t pmap, vm_page_t p) * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ - if (vm_page_sleep(p, "pmaprl", NULL)) + if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; - vm_page_flag_set(p, PG_BUSY); + vm_page_busy(p); /* * Remove the page table page from the processes address space. @@ -2336,7 +2337,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - vm_page_flag_set(p, PG_BUSY); + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + alpha_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); @@ -2356,7 +2357,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - vm_page_flag_set(p, PG_BUSY); + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + alpha_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); @@ -2453,7 +2454,7 @@ pmap_prefault(pmap, addra, entry) if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } - vm_page_flag_set(m, PG_BUSY); + vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); vm_page_flag_set(m, PG_MAPPED); diff --git a/sys/alpha/alpha/symbols.raw b/sys/alpha/alpha/symbols.raw index bf8881a..2b03da9 100644 --- a/sys/alpha/alpha/symbols.raw +++ b/sys/alpha/alpha/symbols.raw @@ -1,6 +1,6 @@ # @(#)symbols.raw 7.6 (Berkeley) 5/8/91 # -# $Id: symbols.raw,v 1.12 1998/03/30 09:48:20 phk Exp $ +# $Id: symbols.raw,v 1.1 1998/06/10 10:53:25 dfr Exp $ # @@ -34,7 +34,8 @@ #pstat # _cons _nswap - _swaplist + _swapblist +# _swaplist #vmstat _cp_time # _rate diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 66c9b63..2a378d3 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -39,7 +39,7 @@ * SUCH DAMAGE. 
* * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.218 1999/01/09 21:41:22 dt Exp $ + * $Id: pmap.c,v 1.219 1999/01/12 00:17:53 eivind Exp $ */ /* @@ -942,7 +942,7 @@ pmap_page_lookup(object, pindex) vm_page_t m; retry: m = vm_page_lookup(object, pindex); - if (m && vm_page_sleep(m, "pplookp", NULL)) + if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } @@ -1009,8 +1009,8 @@ pmap_new_proc(p) } vm_page_wakeup(m); - m->flags &= ~PG_ZERO; - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } if (updateneeded) @@ -1038,7 +1038,7 @@ pmap_dispose_proc(p) if ((m = vm_page_lookup(upobj, i)) == NULL) panic("pmap_dispose_proc: upage already missing???"); - m->flags |= PG_BUSY; + vm_page_busy(m); oldpte = *(ptek + i); *(ptek + i) = 0; @@ -1107,7 +1107,7 @@ pmap_swapin_proc(p) vm_page_wire(m); vm_page_wakeup(m); - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } } @@ -1122,7 +1122,8 @@ pmap_swapin_proc(p) static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { - while (vm_page_sleep(m, "pmuwpt", NULL)); + while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) + ; if (m->hold_count == 0) { vm_offset_t pteva; @@ -1150,12 +1151,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { - if (m->flags & PG_WANTED) { - m->flags &= ~PG_WANTED; - wakeup(m); - } - - m->flags |= PG_BUSY; + vm_page_flash(m); + vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } @@ -1257,7 +1254,8 @@ pmap_pinit(pmap) ptdpg->wire_count = 1; ++cnt.v_wire_count; - ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */ + + vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); @@ -1290,10 +1288,10 @@ pmap_release_free_page(pmap, p) * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ - if (vm_page_sleep(p, "pmaprl", NULL)) + if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; - p->flags |= PG_BUSY; + vm_page_busy(p); /* * Remove the page table page from the processes address space. @@ -1393,8 +1391,9 @@ _pmap_allocpte(pmap, ptepindex) } m->valid = VM_PAGE_BITS_ALL; - m->flags &= ~(PG_ZERO | PG_BUSY); - m->flags |= PG_MAPPED; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED); + vm_page_wakeup(m); return m; } @@ -1713,7 +1712,7 @@ pmap_remove_entry(pmap, ppv, va) TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); @@ -1791,7 +1790,7 @@ pmap_remove_pte(pmap, ptq, va) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (oldpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); @@ -1976,7 +1975,7 @@ pmap_remove_all(pa) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
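The pmap hunks above and below all make the same substitution: direct read-modify-write of m->flags becomes vm_page_flag_set()/vm_page_flag_clear(). A minimal sketch of the idiom, assuming the helpers simply wrap the bit operations; on a uniprocessor kernel of this era they compile to the same code, but funneling every flags access through one pair of functions gives a single place to add locking or atomics later:

    /* before: open-coded, scattered through every pmap */
    m->flags &= ~PG_ZERO;
    m->flags |= PG_MAPPED | PG_WRITEABLE;

    /* after: one audited entry point per direction */
    vm_page_flag_clear(m, PG_ZERO);
    vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);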
@@ -2005,7 +2004,7 @@ pmap_remove_all(pa) free_pv_entry(pv); } - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); if (update_needed) invltlb(); @@ -2081,7 +2080,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) ppv = NULL; if (pbits & PG_A) { ppv = pa_to_pvh(pbits); - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); pbits &= ~PG_A; } if (pbits & PG_M) { @@ -2436,7 +2435,7 @@ pmap_object_init_pt(pmap, addr, object, pindex, size, limit) retry: p = vm_page_lookup(object, pindex); - if (p && vm_page_sleep(p, "init4p", NULL)) + if (p && vm_page_sleep_busy(p, FALSE, "init4p")) goto retry; if (p == NULL) { @@ -2469,7 +2468,7 @@ retry: ptepa += NBPDR; ptepindex += 1; } - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); invltlb(); return; } @@ -2510,11 +2509,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; @@ -2531,11 +2530,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } @@ -2628,10 +2627,10 @@ pmap_prefault(pmap, addra, entry) if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } - m->flags |= PG_BUSY; + vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); - m->flags |= PG_MAPPED; + vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } @@ -3026,7 +3025,7 @@ pmap_remove_pages(pmap, sva, eva) ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); @@ -3406,7 +3405,7 @@ pmap_mincore(pmap, addr) */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; - m->flags |= PG_REFERENCED; + vm_page_flag_set(m, PG_REFERENCED); } } return val; diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index 57ac533..e4be47f 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: cam_periph.c,v 1.8 1998/12/16 21:00:06 ken Exp $ + * $Id: cam_periph.c,v 1.9 1999/01/14 06:21:54 jdp Exp $ */ #include <sys/param.h> @@ -599,7 +599,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) /* * Get the buffer. 
*/ - mapinfo->bp[i] = getpbuf(); + mapinfo->bp[i] = getpbuf(NULL); /* save the buffer's data address */ mapinfo->bp[i]->b_saveaddr = mapinfo->bp[i]->b_data; @@ -674,7 +674,7 @@ cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) mapinfo->bp[i]->b_flags &= ~(B_PHYS|B_BUSY); /* release the buffer */ - relpbuf(mapinfo->bp[i]); + relpbuf(mapinfo->bp[i], NULL); } /* allow ourselves to be swapped once again */ diff --git a/sys/conf/files b/sys/conf/files index 795f6f8..02a281b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -306,6 +306,7 @@ kern/subr_module.c standard kern/subr_prf.c standard kern/subr_prof.c standard kern/subr_rlist.c standard +kern/subr_blist.c standard kern/subr_scanf.c standard kern/subr_xxx.c standard kern/sys_generic.c standard diff --git a/sys/conf/options b/sys/conf/options index 35ceb1a..6dfc0cc 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -1,4 +1,4 @@ -# $Id: options,v 1.120 1999/01/17 19:02:39 peter Exp $ +# $Id: options,v 1.121 1999/01/20 14:49:07 eivind Exp $ # # On the handling of kernel options # @@ -209,6 +209,7 @@ TCPDEBUG IPFILTER opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LKM opt_ipfilter.h +SLIP_IFF_OPTS opt_slip.h # ATM (HARP version) ATM_CORE opt_atm.h diff --git a/sys/fs/procfs/procfs_map.c b/sys/fs/procfs/procfs_map.c index 4dae10a..c6b8966 100644 --- a/sys/fs/procfs/procfs_map.c +++ b/sys/fs/procfs/procfs_map.c @@ -36,7 +36,7 @@ * * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 * - * $Id: procfs_map.c,v 1.17 1998/04/29 04:28:22 dyson Exp $ + * $Id: procfs_map.c,v 1.18 1998/12/04 22:54:51 archie Exp $ */ #include <sys/param.h> @@ -93,7 +93,7 @@ procfs_domap(curp, p, pfs, uio) ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { vm_object_t obj, tobj, lobj; - int ref_count, shadow_count, id, flags; + int ref_count, shadow_count, flags; vm_offset_t addr; int resident, privateresident; char *type; @@ -139,13 +139,11 @@ case OBJT_DEVICE: flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; - id = obj->id; } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; - id = 0; } @@ -154,9 +152,9 @@ case OBJT_DEVICE: * start, end, resident, private resident, cow, access, type. */ snprintf(mebuffer, sizeof(mebuffer), - "0x%x 0x%x %d %d %d %s%s%s %d %d 0x%x %s %s %s\n", + "0x%x 0x%x %d %d %p %s%s%s %d %d 0x%x %s %s %s\n", entry->start, entry->end, - resident, privateresident, id, + resident, privateresident, obj, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c index ff0f347..6096a1b 100644 --- a/sys/fs/specfs/spec_vnops.c +++ b/sys/fs/specfs/spec_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 - * $Id: spec_vnops.c,v 1.77 1998/12/07 21:58:33 archie Exp $ + * $Id: spec_vnops.c,v 1.78 1998/12/16 00:10:51 eivind Exp $ */ #include <sys/param.h> @@ -781,7 +781,7 @@ spec_getpages(ap) blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); - bp = getpbuf(); + bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* @@ -894,13 +894,13 @@ spec_getpages(ap) /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. 
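cam_periph.c above is the first of many callers converted to the new getpbuf()/relpbuf() signatures, which take a pointer to a per-subsystem free count; passing NULL preserves the old unlimited behavior. A sketch of the protocol as it can be inferred from the call sites (the consumer name here is hypothetical):

    /*
     * Each pbuf consumer may declare a quota so that no single
     * subsystem can exhaust the shared pool of nswbuf headers.
     * -1 means "no limit yet"; the subsystem sizes it at init
     * time (compare nfs_pbuf_freecnt = nswbuf / 2 + 1 below).
     */
    static int foo_pbuf_freecnt = -1;       /* hypothetical consumer */

    void
    foo_io(void)
    {
            struct buf *bp;

            bp = getpbuf(&foo_pbuf_freecnt); /* sleeps while quota is 0 */
            /* ... build and issue the transfer ... */
            relpbuf(bp, &foo_pbuf_freecnt);  /* credits the quota back */
    }

trypbuf(&foo_pbuf_freecnt) is the non-sleeping variant, returning NULL on failure; the vfs_cluster.c hunks below use it with cluster_pbuf_freecnt.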
*/ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_OK; } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 66c9b63..2a378d3 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -39,7 +39,7 @@ * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.218 1999/01/09 21:41:22 dt Exp $ + * $Id: pmap.c,v 1.219 1999/01/12 00:17:53 eivind Exp $ */ /* @@ -942,7 +942,7 @@ pmap_page_lookup(object, pindex) vm_page_t m; retry: m = vm_page_lookup(object, pindex); - if (m && vm_page_sleep(m, "pplookp", NULL)) + if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } @@ -1009,8 +1009,8 @@ pmap_new_proc(p) } vm_page_wakeup(m); - m->flags &= ~PG_ZERO; - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } if (updateneeded) @@ -1038,7 +1038,7 @@ pmap_dispose_proc(p) if ((m = vm_page_lookup(upobj, i)) == NULL) panic("pmap_dispose_proc: upage already missing???"); - m->flags |= PG_BUSY; + vm_page_busy(m); oldpte = *(ptek + i); *(ptek + i) = 0; @@ -1107,7 +1107,7 @@ pmap_swapin_proc(p) vm_page_wire(m); vm_page_wakeup(m); - m->flags |= PG_MAPPED | PG_WRITEABLE; + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } } @@ -1122,7 +1122,8 @@ pmap_swapin_proc(p) static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { - while (vm_page_sleep(m, "pmuwpt", NULL)); + while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) + ; if (m->hold_count == 0) { vm_offset_t pteva; @@ -1150,12 +1151,8 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { - if (m->flags & PG_WANTED) { - m->flags &= ~PG_WANTED; - wakeup(m); - } - - m->flags |= PG_BUSY; + vm_page_flash(m); + vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } @@ -1257,7 +1254,8 @@ pmap_pinit(pmap) ptdpg->wire_count = 1; ++cnt.v_wire_count; - ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */ + + vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); @@ -1290,10 +1288,10 @@ pmap_release_free_page(pmap, p) * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ - if (vm_page_sleep(p, "pmaprl", NULL)) + if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; - p->flags |= PG_BUSY; + vm_page_busy(p); /* * Remove the page table page from the processes address space. 
@@ -1393,8 +1391,9 @@ _pmap_allocpte(pmap, ptepindex) } m->valid = VM_PAGE_BITS_ALL; - m->flags &= ~(PG_ZERO | PG_BUSY); - m->flags |= PG_MAPPED; + vm_page_flag_clear(m, PG_ZERO); + vm_page_flag_set(m, PG_MAPPED); + vm_page_wakeup(m); return m; } @@ -1713,7 +1712,7 @@ pmap_remove_entry(pmap, ppv, va) TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); @@ -1791,7 +1790,7 @@ pmap_remove_pte(pmap, ptq, va) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (oldpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); @@ -1976,7 +1975,7 @@ pmap_remove_all(pa) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. @@ -2005,7 +2004,7 @@ pmap_remove_all(pa) free_pv_entry(pv); } - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); if (update_needed) invltlb(); @@ -2081,7 +2080,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) ppv = NULL; if (pbits & PG_A) { ppv = pa_to_pvh(pbits); - ppv->pv_vm_page->flags |= PG_REFERENCED; + vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); pbits &= ~PG_A; } if (pbits & PG_M) { @@ -2436,7 +2435,7 @@ pmap_object_init_pt(pmap, addr, object, pindex, size, limit) retry: p = vm_page_lookup(object, pindex); - if (p && vm_page_sleep(p, "init4p", NULL)) + if (p && vm_page_sleep_busy(p, FALSE, "init4p")) goto retry; if (p == NULL) { @@ -2469,7 +2468,7 @@ retry: ptepa += NBPDR; ptepindex += 1; } - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); invltlb(); return; } @@ -2510,11 +2509,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; @@ -2531,11 +2530,11 @@ retry: (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); - p->flags |= PG_BUSY; + vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); - p->flags |= PG_MAPPED; + vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } @@ -2628,10 +2627,10 @@ pmap_prefault(pmap, addra, entry) if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } - m->flags |= PG_BUSY; + vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); - m->flags |= PG_MAPPED; + vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } @@ -3026,7 +3025,7 @@ pmap_remove_pages(pmap, sva, eva) ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { - ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); + vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); @@ -3406,7 +3405,7 @@ pmap_mincore(pmap, addr) */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; - m->flags |= PG_REFERENCED; 
+ vm_page_flag_set(m, PG_REFERENCED); } } return val; diff --git a/sys/i386/i386/symbols.raw b/sys/i386/i386/symbols.raw index 4703c30..943d8ae 100644 --- a/sys/i386/i386/symbols.raw +++ b/sys/i386/i386/symbols.raw @@ -1,6 +1,6 @@ # @(#)symbols.raw 7.6 (Berkeley) 5/8/91 # -# $Id: symbols.raw,v 1.12 1998/03/30 09:48:20 phk Exp $ +# $Id: symbols.raw,v 1.13 1998/09/15 10:03:43 gibbs Exp $ # @@ -28,7 +28,8 @@ #pstat # _cons _nswap - _swaplist + _swapblist +# _swaplist #vmstat _cp_time # _rate diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index a9776a5..be9f9d3 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 - * $Id: kern_malloc.c,v 1.50 1999/01/08 17:31:09 eivind Exp $ + * $Id: kern_malloc.c,v 1.51 1999/01/10 01:58:24 eivind Exp $ */ #include "opt_vm.h" @@ -101,7 +101,16 @@ struct freelist { #endif /* INVARIANTS */ /* - * Allocate a block of memory + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + * + * If M_ASLEEP is set (M_NOWAIT must also be set), this routine + * will have the side effect of calling asleep() if it returns NULL, + * allowing the parent to await() at some future time. */ void * malloc(size, type, flags) @@ -122,13 +131,26 @@ malloc(size, type, flags) #endif register struct malloc_type *ksp = type; - if (!type->ks_next) + /* + * Must be at splmem() prior to initializing segment to handle + * potential initialization race. + */ + + s = splmem(); + + if (!type->ks_next) { malloc_init(type); + } indx = BUCKETINDX(size); kbp = &bucket[indx]; - s = splmem(); + while (ksp->ks_memuse >= ksp->ks_limit) { + if (flags & M_ASLEEP) { + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + asleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0); + } if (flags & M_NOWAIT) { splx(s); return ((void *) NULL); @@ -239,7 +261,11 @@ out: } /* - * Free a block of memory allocated by malloc. + * free: + * + * Free a block of memory allocated by malloc. + * + * This routine may not block. */ void free(addr, type) diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index 441d95f..ad63a98 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -16,7 +16,7 @@ * 4. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: kern_physio.c,v 1.28 1998/08/19 10:50:32 sos Exp $ + * $Id: kern_physio.c,v 1.29 1998/10/25 17:44:51 phk Exp $ */ #include <sys/param.h> @@ -147,7 +147,7 @@ physio(strategy, bp, dev, rw, minp, uio) doerror: - relpbuf(bpa); + relpbuf(bpa, NULL); if (!bp_alloc) { bp->b_flags &= ~(B_BUSY|B_PHYS); if( bp->b_flags & B_WANTED) { @@ -197,13 +197,13 @@ phygetvpbuf(dev_t dev, int resid) bdsw = cdevsw[major(dev)]; if ((bdsw == NULL) || (bdsw->d_bmaj == -1)) - return getpbuf(); + return getpbuf(NULL); maxio = bdsw->d_maxio; if (resid > maxio) resid = maxio; - return getpbuf(); + return getpbuf(NULL); } static void diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c index d637ab4..810b87e 100644 --- a/sys/kern/subr_rlist.c +++ b/sys/kern/subr_rlist.c @@ -13,7 +13,7 @@ * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This software is a component of "386BSD" developed by - William F. Jolitz, TeleMuse. + * William F. Jolitz, TeleMuse. * 4. 
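The kern_malloc.c hunk above documents the new M_ASLEEP flag: when an M_NOWAIT allocation hits the type's limit, malloc() registers the caller on the type's sleep channel via asleep() before returning NULL, so the caller can await() later instead of spinning. A hedged sketch of the calling pattern; the retry loop is illustrative, and await()'s argument form is assumed from the asleep() call in the hunk, not shown by the patch:

    void *p;

    for (;;) {
            /* fail fast, but queue us for a wakeup if we do fail */
            p = malloc(size, M_DEVBUF, M_NOWAIT | M_ASLEEP);
            if (p != NULL)
                    break;
            /*
             * asleep() already recorded the sleep inside malloc();
             * await() blocks until the type drops below its limit.
             * Assumed signature: await(pri, timo) - see kern_synch.c.
             */
            await(PSWP + 2, 0);
    }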
Neither the name of the developer nor the name "386BSD" * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -54,9 +54,13 @@ * functioning of this software, nor does the author assume any responsibility * for damages incurred with its use. * - * $Id: subr_rlist.c,v 1.28 1999/01/08 17:31:12 eivind Exp $ + * --------- DEPRECIATED --------- + * + * $Id: subr_rlist.c,v 1.29 1999/01/10 01:58:25 eivind Exp $ */ +#if 0 + #include <sys/param.h> #include <sys/systm.h> #include <sys/rlist.h> @@ -307,3 +311,6 @@ rlist_destroy (rlh) rlist_mfree(lp); } } + +#endif + diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index edc74a7..a6c2dfe 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1,4 +1,4 @@ -/* $Id: sysv_shm.c,v 1.38 1998/08/24 08:39:38 dfr Exp $ */ +/* $Id: sysv_shm.c,v 1.39 1998/10/13 08:24:40 dg Exp $ */ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /* @@ -52,6 +52,7 @@ #include <vm/pmap.h> #include <vm/vm_object.h> #include <vm/vm_map.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vm_inherit.h> diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 6cc487a..1634681 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 - * $Id: uipc_syscalls.c,v 1.48 1998/12/03 12:35:47 dg Exp $ + * $Id: uipc_syscalls.c,v 1.49 1998/12/07 21:58:29 archie Exp $ */ #include "opt_compat.h" @@ -1543,7 +1543,13 @@ retry_lookup: VM_WAIT; goto retry_lookup; } - vm_page_flag_clear(pg, PG_BUSY); + /* + * don't just clear PG_BUSY manually - + * vm_page_alloc() should be considered opaque, + * use the VM routine provided to clear + * PG_BUSY. + */ + vm_page_wakeup(pg); } /* * Ensure that our page is still around when the I/O completes. @@ -1583,21 +1589,12 @@ retry_lookup: goto done; } } else { - if ((pg->flags & PG_BUSY) || pg->busy) { - s = splvm(); - if ((pg->flags & PG_BUSY) || pg->busy) { - /* - * Page is busy. Wait and retry. - */ - vm_page_flag_set(pg, PG_WANTED); - tsleep(pg, PVM, "sfpbsy", 0); - splx(s); - goto retry_lookup; - } - splx(s); - } + if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) + goto retry_lookup; + /* - * Protect from having the page ripped out from beneath us. + * Protect from having the page ripped out from + * beneath us. */ vm_page_wire(pg); } diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 93f6164..d528f5e 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 - * $Id: uipc_usrreq.c,v 1.36 1998/07/15 02:32:12 bde Exp $ + * $Id: uipc_usrreq.c,v 1.37 1998/10/25 17:44:51 phk Exp $ */ #include <sys/param.h> @@ -1114,8 +1114,11 @@ unp_gc() /* * for each FD on our hit list, do the following two things */ - for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) - sorflush((struct socket *)(*fpp)->f_data); + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { + struct file *tfp = *fpp; + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) + sorflush((struct socket *)(tfp->f_data)); + } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) closef(*fpp, (struct proc *) NULL); free((caddr_t)extra_ref, M_FILE); diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index c7c8aa9..c1af873 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -13,7 +13,7 @@ * bad that happens because of using this software isn't the responsibility * of the author. 
This software is distributed AS-IS. * - * $Id: vfs_aio.c,v 1.35 1998/11/27 01:14:21 tegge Exp $ + * $Id: vfs_aio.c,v 1.36 1998/12/15 17:38:33 des Exp $ */ /* @@ -386,7 +386,7 @@ aio_free_entry(struct aiocblist *aiocbe) splx(s); if (aiocbe->bp) { vunmapbuf(aiocbe->bp); - relpbuf(aiocbe->bp); + relpbuf(aiocbe->bp, NULL); aiocbe->bp = NULL; } } @@ -1035,7 +1035,7 @@ aio_qphysio(p, aiocbe) } /* create and build a buffer header for a transfer */ - bp = (struct buf *)getpbuf(); + bp = (struct buf *)getpbuf(NULL); /* * get a copy of the kva from the physical buffer @@ -1122,7 +1122,7 @@ doerror: lj->lioj_buffer_count--; } aiocbe->bp = NULL; - relpbuf(bp); + relpbuf(bp, NULL); return error; } @@ -1172,7 +1172,7 @@ aio_fphysio(p, iocb, flgwait) error = bp->b_error; } - relpbuf(bp); + relpbuf(bp, NULL); return (error); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 30018b5..3bb204e 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -11,7 +11,7 @@ * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.192 1999/01/12 11:59:34 eivind Exp $ + * $Id: vfs_bio.c,v 1.193 1999/01/19 08:00:51 dillon Exp $ */ /* @@ -562,7 +562,7 @@ brelse(struct buf * bp) int s; if (bp->b_flags & B_CLUSTER) { - relpbuf(bp); + relpbuf(bp, NULL); return; } @@ -1364,6 +1364,7 @@ vfs_setdirty(struct buf *bp) { break; } } + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); if (boffset < bp->b_dirtyoff) { bp->b_dirtyoff = max(boffset, 0); @@ -1412,7 +1413,6 @@ loop: if ((bp = gbincore(vp, blkno))) { if (bp->b_flags & B_BUSY) { - bp->b_flags |= B_WANTED; if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; @@ -1429,16 +1429,13 @@ loop: bremfree(bp); /* - * check for size inconsistancies (note that they shouldn't - * happen but do when filesystems don't handle the size changes - * correctly.) We are conservative on metadata and don't just - * extend the buffer but write (if needed) and re-constitute it. + * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { - if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) { - allocbuf(bp, size); - } else { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize) + ) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp); @@ -1455,15 +1452,26 @@ loop: goto loop; } } + + /* + * If the size is inconsistant in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting cleared. + */ + + if (bp->b_bcount != size) + allocbuf(bp, size); + KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); + /* * Check that the constituted buffer really deserves for the * B_CACHE bit to be set. B_VMIO type buffers might not * contain fully valid pages. Normal (old-style) buffers - * should be fully valid. + * should be fully valid. This might also lead to B_CACHE + * getting clear. */ - if (bp->b_flags & B_VMIO) { + if ((bp->b_flags & B_VMIO|B_CACHE) == (B_VMIO|B_CACHE)) { int checksize = bp->b_bufsize; int poffset = bp->b_offset & PAGE_MASK; int resid; @@ -1479,6 +1487,19 @@ loop: } } + /* + * If B_DELWRI is set and B_CACHE got cleared ( or was + * already clear ), we have to commit the write and + * retry. The NFS code absolutely depends on this, + * and so might the FFS code. In anycase, it formalizes + * the B_CACHE rules. See sys/buf.h. 
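The comment above states the invariant informally; the hunk that follows enforces it in getblk(), and the note added to sys/buf.h later in this diff restates it. As an editorial aside, the legal flag combinations under the new rule:

    /*
     *  B_CACHE  B_DELWRI   meaning
     *     0        0       contents unknown, clean - safe to re-read
     *     1        0       contents entirely valid, clean
     *     1        1       contents entirely valid, dirty - recyclable
     *                      only after the write completes
     *     0        1       must NOT be left standing: the buffer is
     *                      dirty but no longer fully valid, so it is
     *                      committed (VOP_BWRITE) so B_DELWRI can be
     *                      cleared, and the lookup retried
     */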
+ */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + VOP_BWRITE(bp); + goto loop; + } + if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); @@ -1572,19 +1593,18 @@ geteblk(int size) /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated - * VM object (in the case of VMIO operations). + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve - * deadlock or inconsistant data situations. Tread lightly!!! - * - * Modify the length of a buffer's underlying buffer storage without - * destroying information (unless, of course the buffer is shrinking). + * deadlock or inconsistant data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. */ + int -allocbuf(struct buf * bp, int size) +allocbuf(struct buf *bp, int size) { - - int s; int newbsize, mbsize; int i; @@ -1705,7 +1725,8 @@ allocbuf(struct buf * bp, int size) m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); - vm_page_sleep(m, "biodep", &m->busy); + while (vm_page_sleep_busy(m, TRUE, "biodep")) + ; bp->b_pages[i] = NULL; vm_page_unwire(m, 0); @@ -1771,16 +1792,25 @@ allocbuf(struct buf * bp, int size) } vm_page_wire(m); - vm_page_flag_clear(m, PG_BUSY); + vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; - } else if (m->flags & PG_BUSY) { - s = splvm(); - if (m->flags & PG_BUSY) { - vm_page_flag_set(m, PG_WANTED); - tsleep(m, PVM, "pgtblk", 0); - } - splx(s); + } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) { + /* + * If we had to sleep, retry. + * + * Also note that we only test + * PG_BUSY here, not m->busy. + * + * We cannot sleep on m->busy + * here because a vm_fault -> + * getpages -> cluster-read -> + * ...-> allocbuf sequence + * will convert PG_BUSY to + * m->busy so we have to let + * m->busy through if we do + * not want to deadlock. + */ goto doretry; } else { if ((curproc != pageproc) && @@ -2010,12 +2040,8 @@ biodone(register struct buf * bp) foff += resid; iosize -= resid; } - if (obj && - (obj->paging_in_progress == 0) && - (obj->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(obj, OBJ_PIPWNT); - wakeup(obj); - } + if (obj) + vm_object_pip_wakeupn(obj, 0); } /* * For asynchronous completions, release the buffer now. The brelse @@ -2096,11 +2122,7 @@ vfs_unbusy_pages(struct buf * bp) vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } - if (obj->paging_in_progress == 0 && - (obj->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(obj, OBJ_PIPWNT); - wakeup(obj); - } + vm_object_pip_wakeupn(obj, 0); } } @@ -2109,6 +2131,8 @@ vfs_unbusy_pages(struct buf * bp) * of a page. If the consumer is not NFS, and the page is not * valid for the entire range, clear the B_CACHE flag to force * the consumer to re-read the page. + * + * B_CACHE interaction is especially tricky. */ static void vfs_buf_set_valid(struct buf *bp, @@ -2135,13 +2159,16 @@ vfs_buf_set_valid(struct buf *bp, } evalid = min(evalid, off + size); /* - * Make sure this range is contiguous with the range - * built up from previous pages. If not, then we will - * just use the range from the previous pages. + * We can only set b_validoff/end if this range is contiguous + * with the range built up already. If we cannot set + * b_validoff/end, we must clear B_CACHE to force an update + * to clean the bp up. 
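biodone() and vfs_unbusy_pages() above drop their open-coded OBJ_PIPWNT test in favor of vm_object_pip_wakeupn(). A sketch of what the helper consolidates, reconstructed from the code being deleted; the real definition lives in the VM headers, and the argument convention (i == 0 meaning "just test and wake") is inferred from the call sites:

    static __inline void
    vm_object_pip_wakeupn(vm_object_t object, int i)
    {
            if (i)
                    object->paging_in_progress -= i;
            if ((object->flags & OBJ_PIPWNT) &&
                object->paging_in_progress == 0) {
                    vm_object_clear_flag(object, OBJ_PIPWNT);
                    wakeup(object);
            }
    }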
*/ if (svalid == bp->b_validend) { bp->b_validoff = min(bp->b_validoff, svalid); bp->b_validend = max(bp->b_validend, evalid); + } else { + bp->b_flags &= ~B_CACHE; } } else if (!vm_page_is_valid(m, (vm_offset_t) ((foff + off) & PAGE_MASK), @@ -2154,6 +2181,10 @@ vfs_buf_set_valid(struct buf *bp, * Set the valid bits in a page, taking care of the b_validoff, * b_validend fields which NFS uses to optimise small reads. Off is * the offset within the file and pageno is the page index within the buf. + * + * XXX we have to set the valid & clean bits for all page fragments + * touched by b_validoff/validend, even if the page fragment goes somewhat + * beyond b_validoff/validend due to alignment. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) @@ -2208,7 +2239,7 @@ vfs_busy_pages(struct buf * bp, int clear_modify) retry: for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; - if (vm_page_sleep(m, "vbpage", NULL)) + if (vm_page_sleep_busy(m, FALSE, "vbpage")) goto retry; } diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index ce842ad..781508e 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -33,7 +33,7 @@ * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.76 1999/01/08 17:31:15 eivind Exp $ + * $Id: vfs_cluster.c,v 1.77 1999/01/10 01:58:25 eivind Exp $ */ #include "opt_debug_cluster.h" @@ -68,6 +68,8 @@ static struct buf * extern vm_page_t bogus_page; +extern int cluster_pbuf_freecnt; + /* * Maximum number of blocks for read-ahead. */ @@ -336,7 +338,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; - bp = trypbuf(); + bp = trypbuf(&cluster_pbuf_freecnt); if (bp == 0) return tbp; @@ -475,7 +477,7 @@ cluster_callback(bp) tbp->b_dirtyoff = tbp->b_dirtyend = 0; biodone(tbp); } - relpbuf(bp); + relpbuf(bp, &cluster_pbuf_freecnt); } /* @@ -654,7 +656,7 @@ cluster_wbuild(vp, size, start_lbn, len) (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || - ((bp = trypbuf()) == NULL)) { + ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 179ef78..44b1698 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.181 1999/01/08 17:31:17 eivind Exp $ + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ */ /* @@ -63,10 +63,13 @@ #include <machine/limits.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> #include <vm/vm_object.h> #include <vm/vm_extern.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vnode_pager.h> #include <vm/vm_zone.h> @@ -985,6 +988,10 @@ sched_sync(void) /* * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. 
*/ void pbgetvp(vp, bp) @@ -995,6 +1002,7 @@ pbgetvp(vp, bp) KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; + bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else @@ -1011,7 +1019,34 @@ pbrelvp(bp) KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; } /* @@ -1034,6 +1069,15 @@ reassignbuf(bp, newvp) return; } +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + s = splbio(); /* * Delete from old vnode list, if on one. diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 179ef78..44b1698 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.181 1999/01/08 17:31:17 eivind Exp $ + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ */ /* @@ -63,10 +63,13 @@ #include <machine/limits.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> #include <vm/vm_object.h> #include <vm/vm_extern.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vnode_pager.h> #include <vm/vm_zone.h> @@ -985,6 +988,10 @@ sched_sync(void) /* * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. */ void pbgetvp(vp, bp) @@ -995,6 +1002,7 @@ pbgetvp(vp, bp) KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; + bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else @@ -1011,7 +1019,34 @@ pbrelvp(bp) KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; } /* @@ -1034,6 +1069,15 @@ reassignbuf(bp, newvp) return; } +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + s = splbio(); /* * Delete from old vnode list, if on one. diff --git a/sys/miscfs/devfs/devfs_vnops.c b/sys/miscfs/devfs/devfs_vnops.c index e9bdc2a..56fa842 100644 --- a/sys/miscfs/devfs/devfs_vnops.c +++ b/sys/miscfs/devfs/devfs_vnops.c @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
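pbgetvp()/pbrelvp() now bracket a pbuf's life with B_PAGING, and the new pbreassignbuf() is the only legal way to move such a buffer to another vnode, since reassignbuf() now panics on it (note that the prototype added to buf.h later in this diff spells it bpreassignbuf while the definition here is pbreassignbuf). A sketch of the intended lifecycle, with a hypothetical pager caller:

    struct buf *bp = getpbuf(NULL);

    pbgetvp(vp, bp);        /* sets B_PAGING: bp is NOT on vp's
                             * clean/dirty lists, holds no ref count */
    /* ... issue the paging I/O ... */

    /* moving the bp to another vnode, e.g. the swap device: */
    pbreassignbuf(bp, swapvp);      /* legal: B_PAGING is set */
    /* reassignbuf(bp, swapvp) would panic("cannot reassign
     * paging buffer") */

    pbrelvp(bp);            /* clears B_PAGING, detaches the vnode */
    relpbuf(bp, NULL);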
* - * $Id: devfs_vnops.c,v 1.64 1998/12/15 23:46:59 eivind Exp $ + * $Id: devfs_vnops.c,v 1.65 1999/01/12 11:49:29 eivind Exp $ */ @@ -1933,7 +1933,7 @@ devfs_getpages(struct vop_getpages_args *ap) blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); - bp = getpbuf(); + bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* @@ -2042,13 +2042,13 @@ devfs_getpages(struct vop_getpages_args *ap) /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_OK; } diff --git a/sys/miscfs/procfs/procfs_map.c b/sys/miscfs/procfs/procfs_map.c index 4dae10a..c6b8966 100644 --- a/sys/miscfs/procfs/procfs_map.c +++ b/sys/miscfs/procfs/procfs_map.c @@ -36,7 +36,7 @@ * * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 * - * $Id: procfs_map.c,v 1.17 1998/04/29 04:28:22 dyson Exp $ + * $Id: procfs_map.c,v 1.18 1998/12/04 22:54:51 archie Exp $ */ #include <sys/param.h> @@ -93,7 +93,7 @@ procfs_domap(curp, p, pfs, uio) ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { vm_object_t obj, tobj, lobj; - int ref_count, shadow_count, id, flags; + int ref_count, shadow_count, flags; vm_offset_t addr; int resident, privateresident; char *type; @@ -139,13 +139,11 @@ case OBJT_DEVICE: flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; - id = obj->id; } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; - id = 0; } @@ -154,9 +152,9 @@ case OBJT_DEVICE: * start, end, resident, private resident, cow, access, type. */ snprintf(mebuffer, sizeof(mebuffer), - "0x%x 0x%x %d %d %d %s%s%s %d %d 0x%x %s %s %s\n", + "0x%x 0x%x %d %d %p %s%s%s %d %d 0x%x %s %s %s\n", entry->start, entry->end, - resident, privateresident, id, + resident, privateresident, obj, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index ff0f347..6096a1b 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 - * $Id: spec_vnops.c,v 1.77 1998/12/07 21:58:33 archie Exp $ + * $Id: spec_vnops.c,v 1.78 1998/12/16 00:10:51 eivind Exp $ */ #include <sys/param.h> @@ -781,7 +781,7 @@ spec_getpages(ap) blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); - bp = getpbuf(); + bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* @@ -894,13 +894,13 @@ spec_getpages(ap) /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. */ - relpbuf(bp); + relpbuf(bp, NULL); return VM_PAGER_OK; } diff --git a/sys/net/if_sl.c b/sys/net/if_sl.c index 99a6978..151df6e 100644 --- a/sys/net/if_sl.c +++ b/sys/net/if_sl.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * @(#)if_sl.c 8.6 (Berkeley) 2/1/94 - * $Id: if_sl.c,v 1.69 1998/06/07 17:12:05 dfr Exp $ + * $Id: if_sl.c,v 1.70 1998/07/15 02:32:23 bde Exp $ */ /* @@ -70,7 +70,9 @@ #include "bpfilter.h" #include "opt_inet.h" - +#if !defined(ACTUALLY_LKM_NOT_KERNEL) && !defined(KLD_MODULE) +#include "opt_slip.h" +#endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -217,7 +219,11 @@ slattach(dummy) sc->sc_if.if_unit = i++; sc->sc_if.if_mtu = SLMTU; sc->sc_if.if_flags = - IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST; +#ifdef SLIP_IFF_OPTS + SLIP_IFF_OPTS; +#else + IFF_BROADCAST | IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST; +#endif sc->sc_if.if_type = IFT_SLIP; sc->sc_if.if_ioctl = slioctl; sc->sc_if.if_output = sloutput; diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index c973700..fb437a5 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.64 1998/12/07 21:58:43 archie Exp $ + * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $ */ @@ -68,6 +68,7 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, static void nfs_prot_buf __P((struct buf *bp, int off, int n)); extern int nfs_numasync; +extern int nfs_pbuf_freecnt; extern struct nfsstats nfsstats; /* @@ -113,7 +114,7 @@ nfs_getpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); npages = btoc(count); kva = (vm_offset_t) bp->b_data; @@ -132,10 +133,16 @@ nfs_getpages(ap) error = nfs_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); - if (error && (uio.uio_resid == count)) + if (error && (uio.uio_resid == count)) { + printf("nfs_getpages: error %d\n", error); + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) + vnode_pager_freepage(pages[i]); + } return VM_PAGER_ERROR; + } size = count - uio.uio_resid; @@ -228,7 +235,7 @@ nfs_putpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -251,7 +258,7 @@ nfs_putpages(ap) error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; @@ -439,6 +446,7 @@ again: bp = nfs_getcacheblk(vp, lbn, bufsize, p); if (!bp) return (EINTR); + /* * If we are being called from nfs_getpages, we must * make sure the buffer is a vmio buffer. The vp will @@ -779,6 +787,7 @@ again: * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. */ + if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { bp->b_proc = p; @@ -1254,17 +1263,24 @@ nfs_doio(bp, cr, p) * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. + * + * If the buffer is marked B_PAGING, it does not reside on + * the vp's paging queues so we do not ( and cannot ) reassign + * it. XXX numdirtybuffers should be integrated into + * reassignbuf() call. 
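On the if_sl.c hunk above: the default SLIP interface flags become overridable through the new SLIP_IFF_OPTS option registered in sys/conf/options. A hypothetical kernel-config line and its effect on slattach():

    /* in the kernel config file (values hypothetical):
     *   options SLIP_IFF_OPTS="IFF_POINTOPOINT|SC_AUTOCOMP|IFF_MULTICAST"
     * opt_slip.h then carries the corresponding #define, so: */
    sc->sc_if.if_flags =
    #ifdef SLIP_IFF_OPTS
            SLIP_IFF_OPTS;          /* config-supplied flags */
    #else
            IFF_BROADCAST | IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST;
    #endif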
*/ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; bp->b_flags &= ~(B_INVAL|B_NOCACHE); - ++numdirtybuffers; - bp->b_flags |= B_DELWRI; - s = splbio(); - reassignbuf(bp, vp); - splx(s); + if ((bp->b_flags & B_PAGING) == 0) { + ++numdirtybuffers; + bp->b_flags |= B_DELWRI; + s = splbio(); + reassignbuf(bp, vp); + splx(s); + } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c index b3eec24..6c9cfb7 100644 --- a/sys/nfs/nfs_common.c +++ b/sys/nfs/nfs_common.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c index b3eec24..6c9cfb7 100644 --- a/sys/nfs/nfs_subs.c +++ b/sys/nfs/nfs_subs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index c97267a..4131b60 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.115 1998/12/25 10:34:27 dfr Exp $ + * $Id: nfs_vnops.c,v 1.116 1999/01/12 12:39:14 eivind Exp $ */ @@ -2627,14 +2627,17 @@ nfs_strategy(ap) if (bp->b_flags & B_PHYS) panic("nfs physio"); + if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ + if (bp->b_flags & B_READ) cr = bp->b_rcred; else cr = bp->b_wcred; + /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index c973700..fb437a5 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 - * $Id: nfs_bio.c,v 1.64 1998/12/07 21:58:43 archie Exp $ + * $Id: nfs_bio.c,v 1.65 1998/12/14 17:51:30 dt Exp $ */ @@ -68,6 +68,7 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, static void nfs_prot_buf __P((struct buf *bp, int off, int n)); extern int nfs_numasync; +extern int nfs_pbuf_freecnt; extern struct nfsstats nfsstats; /* @@ -113,7 +114,7 @@ nfs_getpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. 
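The hunk below threads an NFS-private quota, nfs_pbuf_freecnt, through every getpbuf()/relpbuf() pair in nfs_getpages()/nfs_putpages(). nfs_init() sizes it to nswbuf / 2 + 1, so the NFS paging paths can never tie up more than about half of the shared pbuf headers. A worked example, assuming the then-typical pool size:

    /* nswbuf is typically clamped around 256 on kernels of this era */
    nfs_pbuf_freecnt = nswbuf / 2 + 1;      /* 256 / 2 + 1 = 129 */

    /* every allocation is paired against the same counter: */
    bp = getpbuf(&nfs_pbuf_freecnt);        /* decrements, sleeps at 0 */
    /* ... read/write RPC ... */
    relpbuf(bp, &nfs_pbuf_freecnt);         /* increments, wakes waiters */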
*/ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); npages = btoc(count); kva = (vm_offset_t) bp->b_data; @@ -132,10 +133,16 @@ nfs_getpages(ap) error = nfs_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); - if (error && (uio.uio_resid == count)) + if (error && (uio.uio_resid == count)) { + printf("nfs_getpages: error %d\n", error); + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) + vnode_pager_freepage(pages[i]); + } return VM_PAGER_ERROR; + } size = count - uio.uio_resid; @@ -228,7 +235,7 @@ nfs_putpages(ap) * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ - bp = getpbuf(); + bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -251,7 +258,7 @@ nfs_putpages(ap) error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); pmap_qremove(kva, npages); - relpbuf(bp); + relpbuf(bp, &nfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; @@ -439,6 +446,7 @@ again: bp = nfs_getcacheblk(vp, lbn, bufsize, p); if (!bp) return (EINTR); + /* * If we are being called from nfs_getpages, we must * make sure the buffer is a vmio buffer. The vp will @@ -779,6 +787,7 @@ again: * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. */ + if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { bp->b_proc = p; @@ -1254,17 +1263,24 @@ nfs_doio(bp, cr, p) * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. + * + * If the buffer is marked B_PAGING, it does not reside on + * the vp's paging queues so we do not ( and cannot ) reassign + * it. XXX numdirtybuffers should be integrated into + * reassignbuf() call. */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; bp->b_flags &= ~(B_INVAL|B_NOCACHE); - ++numdirtybuffers; - bp->b_flags |= B_DELWRI; - s = splbio(); - reassignbuf(bp, vp); - splx(s); + if ((bp->b_flags & B_PAGING) == 0) { + ++numdirtybuffers; + bp->b_flags |= B_DELWRI; + s = splbio(); + reassignbuf(bp, vp); + splx(s); + } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c index b3eec24..6c9cfb7 100644 --- a/sys/nfsclient/nfs_subs.c +++ b/sys/nfsclient/nfs_subs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index c97267a..4131b60 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.115 1998/12/25 10:34:27 dfr Exp $ + * $Id: nfs_vnops.c,v 1.116 1999/01/12 12:39:14 eivind Exp $ */ @@ -2627,14 +2627,17 @@ nfs_strategy(ap) if (bp->b_flags & B_PHYS) panic("nfs physio"); + if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ + if (bp->b_flags & B_READ) cr = bp->b_rcred; else cr = bp->b_wcred; + /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c index b3eec24..6c9cfb7 100644 --- a/sys/nfsserver/nfs_srvsubs.c +++ b/sys/nfsserver/nfs_srvsubs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.69 1998/12/14 18:54:03 dt Exp $ + * $Id: nfs_subs.c,v 1.70 1999/01/05 18:49:58 eivind Exp $ */ /* @@ -99,6 +99,7 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; +int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; @@ -1191,6 +1192,8 @@ nfs_init(vfsp) sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif + nfs_pbuf_freecnt = nswbuf / 2 + 1; + return (0); } diff --git a/sys/sys/bio.h b/sys/sys/bio.h index 191fdbc..f2b0f4b 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.60 1998/10/31 14:05:11 peter Exp $ + * $Id: buf.h,v 1.61 1998/11/13 01:01:44 dg Exp $ */ #ifndef _SYS_BUF_H_ @@ -116,7 +116,10 @@ struct buf { caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ - void *b_spc; + union pager_info { + void *pg_spc; + int pg_reqpage; + } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; @@ -126,9 +129,29 @@ struct buf { struct workhead b_dep; /* List of filesystem dependencies. */ }; +#define b_spc b_pager.pg_spc + /* * These flags are kept in b_flags. + * + * Notes: + * + * B_ASYNC VOP calls on bp's are usually async whether or not + * B_ASYNC is set, but some subsystems, such as NFS, like + * to know what is best for the caller so they can + * optimize the I/O. + * + * B_PAGING Indicates that bp is being used by the paging system or + * some paging system and that the bp is not linked into + * the b_vp's clean/dirty linked lists or ref counts. + * Buffer vp reassignments are illegal in this case. + * + * B_CACHE This may only be set if the buffer is entirely valid. + * The situation where B_DELWRI is set and B_CACHE gets + * cleared MUST be committed to disk so B_DELWRI can + * also be cleared. */ + #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. 
*/ @@ -312,13 +335,12 @@ int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); -struct buf * getpbuf __P((void)); +struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); -int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); @@ -336,13 +358,15 @@ void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); -void relpbuf __P((struct buf *)); +void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); +int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); -struct buf *trypbuf __P((void)); +void bpreassignbuf __P((struct buf *, struct vnode *)); +struct buf *trypbuf __P((int *)); void vfs_bio_need_satisfy __P((void)); #endif /* KERNEL */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 191fdbc..f2b0f4b 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.60 1998/10/31 14:05:11 peter Exp $ + * $Id: buf.h,v 1.61 1998/11/13 01:01:44 dg Exp $ */ #ifndef _SYS_BUF_H_ @@ -116,7 +116,10 @@ struct buf { caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ - void *b_spc; + union pager_info { + void *pg_spc; + int pg_reqpage; + } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; @@ -126,9 +129,29 @@ struct buf { struct workhead b_dep; /* List of filesystem dependencies. */ }; +#define b_spc b_pager.pg_spc + /* * These flags are kept in b_flags. + * + * Notes: + * + * B_ASYNC VOP calls on bp's are usually async whether or not + * B_ASYNC is set, but some subsystems, such as NFS, like + * to know what is best for the caller so they can + * optimize the I/O. + * + * B_PAGING Indicates that bp is being used by the paging system or + * some paging system and that the bp is not linked into + * the b_vp's clean/dirty linked lists or ref counts. + * Buffer vp reassignments are illegal in this case. + * + * B_CACHE This may only be set if the buffer is entirely valid. + * The situation where B_DELWRI is set and B_CACHE gets + * cleared MUST be committed to disk so B_DELWRI can + * also be cleared. */ + #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. 
*/ @@ -312,13 +335,12 @@ int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); -struct buf * getpbuf __P((void)); +struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); -int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); @@ -336,13 +358,15 @@ void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); -void relpbuf __P((struct buf *)); +void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); +int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); -struct buf *trypbuf __P((void)); +void bpreassignbuf __P((struct buf *, struct vnode *)); +struct buf *trypbuf __P((int *)); void vfs_bio_need_satisfy __P((void)); #endif /* KERNEL */ diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index d8e0cd8..87949b8 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)malloc.h 8.5 (Berkeley) 5/3/95 - * $Id: malloc.h,v 1.37 1998/03/08 09:58:26 julian Exp $ + * $Id: malloc.h,v 1.38 1998/11/10 08:46:24 peter Exp $ */ #ifndef _SYS_MALLOC_H_ @@ -42,11 +42,13 @@ #define KMEMSTATS /* - * flags to malloc + * flags to malloc. */ + #define M_WAITOK 0x0000 -#define M_NOWAIT 0x0001 -#define M_KERNEL 0x0002 +#define M_NOWAIT 0x0001 /* do not block */ +#define M_USE_RESERVE 0x0002 /* can alloc out of reserve memory */ +#define M_ASLEEP 0x0004 /* async sleep on failure */ #define M_MAGIC 877983977 /* time when first defined :-) */ diff --git a/sys/sys/param.h b/sys/sys/param.h index badddca..fb15db3 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 - * $Id: param.h,v 1.37 1998/10/16 04:28:04 jkh Exp $ + * $Id: param.h,v 1.38 1998/10/16 06:55:07 jkh Exp $ */ #ifndef _SYS_PARAM_H_ @@ -227,4 +227,10 @@ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<<FSHIFT) +#define dbtoc(db) /* calculates devblks to pages */ \ + ((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT)) + +#define ctodb(db) /* calculates pages to devblks */ \ + ((db) << (PAGE_SHIFT - DEV_BSHIFT)) + #endif /* _SYS_PARAM_H_ */ diff --git a/sys/sys/types.h b/sys/sys/types.h index 93f8698..c65fe67 100644 --- a/sys/sys/types.h +++ b/sys/sys/types.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)types.h 8.6 (Berkeley) 2/19/95 - * $Id: types.h,v 1.25 1998/06/07 17:13:05 dfr Exp $ + * $Id: types.h,v 1.26 1998/12/19 00:02:34 dt Exp $ */ #ifndef _SYS_TYPES_H_ @@ -68,6 +68,7 @@ typedef quad_t * qaddr_t; typedef char * caddr_t; /* core address */ typedef int32_t daddr_t; /* disk address */ +typedef u_int32_t u_daddr_t; /* unsigned disk address */ typedef u_int32_t dev_t; /* device number */ typedef u_int32_t fixpt_t; /* fixed point number */ typedef u_int32_t gid_t; /* group id */ diff --git a/sys/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h index ca19cc4..ae5b7af 100644 --- a/sys/ufs/mfs/mfs_extern.h +++ b/sys/ufs/mfs/mfs_extern.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
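
The dbtoc()/ctodb() pair added to param.h converts between device blocks and pages with pure shifts, dbtoc() rounding any partial page up. A self-contained check, assuming the usual DEV_BSHIFT of 9 (512-byte blocks) and PAGE_SHIFT of 12 (4K pages); the bodies mirror the diff, with the macro argument fully parenthesized:

#include <stdio.h>

#define DEV_BSHIFT	9
#define PAGE_SHIFT	12

#define ctodb(pg)	((pg) << (PAGE_SHIFT - DEV_BSHIFT))
#define dbtoc(db)	(((db) + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))

int
main(void)
{
	printf("ctodb(1) = %d\n", ctodb(1));	/* 8 device blocks per page */
	printf("dbtoc(8) = %d\n", dbtoc(8));	/* exactly one page */
	printf("dbtoc(9) = %d\n", dbtoc(9));	/* partial page rounds up: 2 */
	return (0);
}
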
* * @(#)mfs_extern.h 8.4 (Berkeley) 3/30/95 - * $Id: mfs_extern.h,v 1.10 1997/10/16 10:50:00 phk Exp $ + * $Id: mfs_extern.h,v 1.11 1998/02/03 21:52:02 bde Exp $ */ #ifndef _UFS_MFS_MFS_EXTERN_H_ @@ -41,8 +41,9 @@ struct buf; struct mount; struct proc; struct vnode; +struct mfsnode; -void mfs_doio __P((struct buf *bp, caddr_t base)); +void mfs_doio __P((struct buf *bp, struct mfsnode *mfsnode)); int mfs_mountfs __P((struct vnode *, struct mount *, struct proc *)); int mfs_mountroot __P((void)); diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c index 1ea0804..73ab75a 100644 --- a/sys/ufs/mfs/mfs_vfsops.c +++ b/sys/ufs/mfs/mfs_vfsops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 - * $Id: mfs_vfsops.c,v 1.52 1998/12/07 21:58:49 archie Exp $ + * $Id: mfs_vfsops.c,v 1.53 1999/01/01 04:14:11 dillon Exp $ */ @@ -64,8 +64,10 @@ MALLOC_DEFINE(M_MFSNODE, "MFS node", "MFS vnode private part"); u_char * mfs_getimage __P((void)); +#ifdef MFS_ROOT static caddr_t mfs_rootbase; /* address of mini-root in kernel virtual memory */ static u_long mfs_rootsize; /* size of mini-root in bytes */ +#endif static int mfs_minor; /* used for building internal dev_t */ @@ -178,7 +180,9 @@ mfs_mount(mp, path, data, ndp, p) struct mfs_args args; struct ufsmount *ump; struct fs *fs; +#ifdef MFS_ROOT u_char *base; +#endif struct mfsnode *mfsp; u_int size; int flags, err; @@ -344,7 +348,9 @@ mfs_mount(mp, path, data, ndp, p) goto error_2; } +#ifdef MFS_ROOT dostatfs: +#endif /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname @@ -387,11 +393,8 @@ mfs_start(mp, flags, p) register struct vnode *vp = VFSTOUFS(mp)->um_devvp; register struct mfsnode *mfsp = VTOMFS(vp); register struct buf *bp; - register caddr_t base; register int gotsig = 0; - base = mfsp->mfs_baseoff; - /* * Must set P_SYSTEM to prevent system from trying to kill * this process. What happens is that the process is unkillable, @@ -402,11 +405,20 @@ mfs_start(mp, flags, p) curproc->p_flag |= P_SYSTEM; while (mfsp->mfs_active) { + int s; + + s = splbio(); + while (bp = bufq_first(&mfsp->buf_queue)) { bufq_remove(&mfsp->buf_queue, bp); - mfs_doio(bp, base); + splx(s); + mfs_doio(bp, mfsp); wakeup((caddr_t)bp); + s = splbio(); } + + splx(s); + /* * If a non-ignored signal is received, try to unmount. * If that fails, clear the signal (it has been "processed"), diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c index 88cfec6..083843c 100644 --- a/sys/ufs/mfs/mfs_vnops.c +++ b/sys/ufs/mfs/mfs_vnops.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
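
The reworked mfs_start() loop in mfs_vfsops.c above holds splbio() only across the queue manipulation and drops it for the potentially slow mfs_doio() copy, since buffers can be queued from bio interrupt level. A toy user-space rendering of just that loop shape; the spl and bufq stand-ins below are fakes, present only so the control flow compiles:

#include <stdio.h>

static int queue[] = { 11, 22, 33, 0 };		/* fake buf queue, 0 ends it */
static int qhead;

static int splbio(void) { return (0); }	/* would raise block-I/O ipl */
static void splx(int s) { (void)s; }		/* would restore previous ipl */
static int *bufq_first_demo(void) { return (queue[qhead] ? &queue[qhead] : NULL); }
static void bufq_remove_demo(void) { qhead++; }

int
main(void)
{
	int s, *bp;

	s = splbio();			/* protect the queue scan */
	while ((bp = bufq_first_demo()) != NULL) {
		bufq_remove_demo();
		splx(s);		/* drop ipl for the slow copy */
		printf("mfs_doio on buf %d\n", *bp);
		s = splbio();		/* retake before touching the queue */
	}
	splx(s);
	return (0);
}
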
* * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 - * $Id: mfs_vnops.c,v 1.37 1998/07/11 07:46:05 bde Exp $ + * $Id: mfs_vnops.c,v 1.38 1998/09/07 06:52:01 phk Exp $ */ #include <sys/param.h> @@ -41,6 +41,8 @@ #include <sys/buf.h> #include <sys/vnode.h> #include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/mman.h> #include <miscfs/specfs/specdev.h> @@ -51,6 +53,7 @@ static int mfs_badop __P((struct vop_generic_args *)); static int mfs_bmap __P((struct vop_bmap_args *)); static int mfs_close __P((struct vop_close_args *)); static int mfs_fsync __P((struct vop_fsync_args *)); +static int mfs_freeblks __P((struct vop_freeblks_args *)); static int mfs_inactive __P((struct vop_inactive_args *)); /* XXX */ static int mfs_open __P((struct vop_open_args *)); static int mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */ @@ -66,7 +69,7 @@ static struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { { &vop_bmap_desc, (vop_t *) mfs_bmap }, { &vop_bwrite_desc, (vop_t *) vop_defaultop }, { &vop_close_desc, (vop_t *) mfs_close }, - { &vop_freeblks_desc, (vop_t *) vop_defaultop }, + { &vop_freeblks_desc, (vop_t *) mfs_freeblks }, { &vop_fsync_desc, (vop_t *) mfs_fsync }, { &vop_getpages_desc, (vop_t *) mfs_getpages }, { &vop_inactive_desc, (vop_t *) mfs_inactive }, @@ -119,6 +122,38 @@ mfs_fsync(ap) } /* + * mfs_freeblks() - hook to allow us to free physical memory. + * + * We implement the B_FREEBUF strategy. We can't just madvise() + * here because we have to do it in the correct order vs other bio + * requests, so we queue it. + */ + +static int +mfs_freeblks(ap) + struct vop_freeblks_args /* { + struct vnode *a_vp; + daddr_t a_addr; + daddr_t a_length; + } */ *ap; +{ + struct buf *bp; + struct vnode *vp; + + if (!vfinddev(ap->a_vp->v_rdev, VBLK, &vp) || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + + bp = geteblk(ap->a_length); + bp->b_flags |= B_FREEBUF | B_BUSY; + bp->b_dev = ap->a_vp->v_rdev; + bp->b_blkno = ap->a_addr; + bp->b_offset = dbtob(ap->a_addr); + bp->b_bcount = ap->a_length; + VOP_STRATEGY(vp, bp); + return(0); +} + +/* * Pass I/O requests to the memory filesystem process. */ static int @@ -132,26 +167,50 @@ mfs_strategy(ap) register struct mfsnode *mfsp; struct vnode *vp; struct proc *p = curproc; /* XXX */ + int s; if (!vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0) panic("mfs_strategy: bad dev"); mfsp = VTOMFS(vp); - /* check for mini-root access */ + + /* + * splbio required for queueing/dequeueing, in case of forwarded + * BPs from bio interrupts (??). It may not be necessary. + */ + + s = splbio(); + if (mfsp->mfs_pid == 0) { + /* + * mini-root. Note: B_FREEBUF not supported at the moment, + * I'm not sure what kind of dataspace b_data is in. + */ caddr_t base; base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_FREEBUF) + ; if (bp->b_flags & B_READ) bcopy(base, bp->b_data, bp->b_bcount); else bcopy(bp->b_data, base, bp->b_bcount); biodone(bp); } else if (mfsp->mfs_pid == p->p_pid) { - mfs_doio(bp, mfsp->mfs_baseoff); + /* + * VOP to self + */ + splx(s); + mfs_doio(bp, mfsp); + s = splbio(); } else { + /* + * VOP from some other process, queue to MFS process and + * wake it up. + */ bufq_insert_tail(&mfsp->buf_queue, bp); wakeup((caddr_t)vp); } + splx(s); return (0); } @@ -159,18 +218,59 @@ mfs_strategy(ap) * Memory file system I/O. * * Trivial on the HP since buffer has already been mapping into KVA space. + * + * Read and Write are handled with a simple copyin and copyout. 
+ * + * We also partially support VOP_FREEBLKS() via B_FREEBUF. We can't implement it + * completely -- for example, on fragments or inode metadata, but we can + * implement it for page-aligned requests. */ void -mfs_doio(bp, base) +mfs_doio(bp, mfsp) register struct buf *bp; - caddr_t base; + struct mfsnode *mfsp; { + caddr_t base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + + if (bp->b_flags & B_FREEBUF) { + /* + * Implement B_FREEBUF, which allows the filesystem to tell + * a block device when blocks are no longer needed (like when + * a file is deleted). We use the hook to MADV_FREE the VM. + * This makes an MFS filesystem work as well or better than + * a sun-style swap-mounted filesystem. + */ + int bytes = bp->b_bcount; + + if ((vm_offset_t)base & PAGE_MASK) { + int n = PAGE_SIZE - ((vm_offset_t)base & PAGE_MASK); + bytes -= n; + base += n; + } + if (bytes > 0) { + struct madvise_args uap; - base += (bp->b_blkno << DEV_BSHIFT); - if (bp->b_flags & B_READ) + bytes &= ~PAGE_MASK; + if (bytes != 0) { + bzero(&uap, sizeof(uap)); + uap.addr = base; + uap.len = bytes; + uap.behav = MADV_FREE; + madvise(curproc, &uap); + } + } + bp->b_error = 0; + } else if (bp->b_flags & B_READ) { + /* + * Read data from our 'memory' disk + */ bp->b_error = copyin(base, bp->b_data, bp->b_bcount); - else + } else { + /* + * Write data to our 'memory' disk + */ bp->b_error = copyout(bp->b_data, base, bp->b_bcount); + } if (bp->b_error) bp->b_flags |= B_ERROR; biodone(bp); @@ -222,7 +322,7 @@ mfs_close(ap) */ while (bp = bufq_first(&mfsp->buf_queue)) { bufq_remove(&mfsp->buf_queue, bp); - mfs_doio(bp, mfsp->mfs_baseoff); + mfs_doio(bp, mfsp); wakeup((caddr_t)bp); } /* diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index 026d3486..fd3555a 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 - * $Id: ufs_readwrite.c,v 1.54 1998/12/15 03:29:52 julian Exp $ + * $Id: ufs_readwrite.c,v 1.55 1999/01/07 16:14:19 bde Exp $ */ #define BLKSIZE(a, b, c) blksize(a, b, c) @@ -392,7 +392,10 @@ WRITE(ap) panic("%s: nonsync dir write", WRITE_S); break; default: - panic("%s: type", WRITE_S); + panic("%s: type %p %d (%d,%d)", WRITE_S, vp, (int)vp->v_type, + (int)uio->uio_offset, + (int)uio->uio_resid + ); } fs = ip->I_FS; @@ -598,9 +601,8 @@ ffs_getpages(ap) vm_page_busy(m); vm_page_free(m); } else if (m == mreq) { - while (m->flags & PG_BUSY) { - vm_page_sleep(m, "ffspwt", NULL); - } + while (vm_page_sleep_busy(m, FALSE, "ffspwt")) + ; vm_page_busy(m); vp->v_lastr = m->pindex + 1; } else { diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 1010085..49e1a29 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 - * $Id: ufs_vnops.c,v 1.103 1998/12/24 09:45:10 bde Exp $ + * $Id: ufs_vnops.c,v 1.104 1999/01/07 16:14:19 bde Exp $ */ #include "opt_quota.h" @@ -1731,6 +1731,9 @@ ufs_abortop(ap) /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. + * + * In order to be able to swap to a file, the VOP_BMAP operation may not + * deadlock on memory. See ufs_bmap() for details. 
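
Since MADV_FREE only makes sense on whole pages, the B_FREEBUF path above shaves the unaligned head and tail off the request before calling madvise(); fragments and inode metadata therefore simply are not freed. Worked numbers, assuming 4K pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumption for the demo */
#define PAGE_MASK	(PAGE_SIZE - 1)

int
main(void)
{
	unsigned long base = 0x10400;	/* request starts 1K into a page */
	unsigned long bytes = 10240;	/* and is 10K long */
	unsigned long n;

	if (base & PAGE_MASK) {		/* shave the unaligned head */
		n = PAGE_SIZE - (base & PAGE_MASK);
		bytes -= n;
		base += n;
	}
	bytes &= ~PAGE_MASK;		/* shave the unaligned tail */
	printf("MADV_FREE range: base=0x%lx bytes=%lu (%lu page(s))\n",
	    base, bytes, bytes / PAGE_SIZE);	/* 0x11000, 4096, 1 page */
	return (0);
}
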
*/ int ufs_strategy(ap) diff --git a/sys/vm/default_pager.c b/sys/vm/default_pager.c index ba92894..16b7512 100644 --- a/sys/vm/default_pager.c +++ b/sys/vm/default_pager.c @@ -28,7 +28,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: default_pager.c,v 1.15 1998/02/06 12:14:20 eivind Exp $ + * The default pager is responsible for supplying backing store to unbacked + * storage. The backing store is usually swap so we just fall through to + * the swap routines. However, since swap metadata has not been assigned, + * the swap routines assign and manage the swap backing store through the + * vm_page->swapblk field. The object is only converted when the page is + * physically freed after having been cleaned and even then vm_page->swapblk + * is maintained whenever a resident page also has swap backing store. + * + * $Id: default_pager.c,v 1.16 1998/10/13 08:24:42 dg Exp $ */ #include <sys/param.h> @@ -78,6 +86,14 @@ default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, return vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(round_page(offset + size))); } +/* + * deallocate resources associated with default objects. The default objects + * have no special resources allocated to them, but the vm_page's being used + * in this object might. Still, we do not have to do anything - we will free + * the swapblk in the underlying vm_page's when we free the vm_page or + * garbage collect the vm_page cache list. + */ + static void default_pager_dealloc(object) vm_object_t object; @@ -88,9 +104,11 @@ default_pager_dealloc(object) } /* - * The default pager has no backing store, so we always return - * failure. + * Load pages from backing store. Since OBJT_DEFAULT is converted to + * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never + * see a vm_page with assigned swap here. */ + static int default_pager_getpages(object, m, count, reqpage) vm_object_t object; @@ -101,6 +119,13 @@ default_pager_getpages(object, m, count, reqpage) return VM_PAGER_FAIL; } +/* + * Store pages to backing store. We should assign swap and initiate + * I/O. We do not actually convert the object to OBJT_SWAP here. The + * object will be converted when the written-out vm_page_t is moved from the + * cache to the free list. + */ + static int default_pager_putpages(object, m, c, sync, rtvals) vm_object_t object; @@ -109,26 +134,22 @@ default_pager_putpages(object, m, c, sync, rtvals) boolean_t sync; int *rtvals; { - int i; - - /* - * Try to convert the object type into a OBJT_SWAP. - * If the swp structure allocation fails, convert it - * back to OBJT_DEFAULT and return failure. Otherwise - * pass this putpages to the swap pager. - */ - object->type = OBJT_SWAP; - - if (swap_pager_swp_alloc(object, M_KERNEL) != 0) { - object->type = OBJT_DEFAULT; - for (i = 0; i < c; i++) - rtvals[i] = VM_PAGER_FAIL; - return VM_PAGER_FAIL; - } - return swap_pager_putpages(object, m, c, sync, rtvals); } +/* + * Tell us whether the backing store for the requested (object,index) is + * synchronized. i.e. tell us whether we can throw the page away and + * reload it later. So, for example, if we are in the process of writing + * the page to its backing store, or if no backing store has been assigned, + * it is not yet synchronized. + * + * It is possible to have fully-synchronized swap assigned without the + * object having been converted. 
We just call swap_pager_haspage() to + * deal with it since it must already deal with it plus deal with swap + * meta-data structures. + */ + static boolean_t default_pager_haspage(object, pindex, before, after) vm_object_t object; @@ -139,24 +160,3 @@ default_pager_haspage(object, pindex, before, after) return FALSE; } -void -default_pager_convert_to_swap(object) - vm_object_t object; -{ - object->type = OBJT_SWAP; - if (swap_pager_swp_alloc(object, M_KERNEL) != 0) { - object->type = OBJT_DEFAULT; - } -} - -void -default_pager_convert_to_swapq(object) - vm_object_t object; -{ - if (object && - (object->type == OBJT_DEFAULT) && - (object != kernel_object && object != kmem_object) && - (object->size > ((cnt.v_page_count - cnt.v_wire_count) / 4))) - default_pager_convert_to_swap(object); -} - diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index a200b9c..cc742b0 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 - * $Id: device_pager.c,v 1.36 1998/12/07 21:58:50 archie Exp $ + * $Id: device_pager.c,v 1.37 1999/01/08 17:31:23 eivind Exp $ */ #include <sys/param.h> @@ -200,7 +200,7 @@ dev_pager_getpages(object, m, count, reqpage) int prot; dev = (dev_t) (uintptr_t) object->handle; - offset = m[reqpage]->pindex + OFF_TO_IDX(object->paging_offset); + offset = m[reqpage]->pindex; prot = PROT_READ; /* XXX should pass in? */ mapfunc = cdevsw[major(dev)]->d_mmap; diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 1691168..b063520 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 @@ -36,17 +37,34 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * + * New Swap System + * Matthew Dillon + * + * Radix Bitmap 'blists'. + * + * - The new swapper uses the new radix bitmap code. This should scale + * to arbitrarily small or arbitrarily large swap spaces and an almost + * arbitrary degree of fragmentation. + * + * Features: + * + * - on the fly reallocation of swap during putpages. The new system + * does not try to keep previously allocated swap blocks for dirty + * pages. + * + * - on the fly deallocation of swap + * + * - No more garbage collection required. Unnecessarily allocated swap + * blocks only exist for dirty vm_page_t's now and these are already + * cycled (in a high-load system) by the pager. We also do on-the-fly + * removal of invalidated swap blocks when a page is destroyed + * or renamed. + * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 - * $Id: swap_pager.c,v 1.106 1999/01/08 17:31:23 eivind Exp $ - */ - -/* - * Quick hack to page to dedicated partition(s). 
- * TODO: - * Add multiprocessor locks - * Deal with async writes in a better fashion + * + * $Id: swap_pager.c,v 1.107 1999/01/10 01:58:28 eivind Exp $ */ #include <sys/param.h> @@ -57,18 +75,16 @@ #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/vmmeter.h> -#include <sys/rlist.h> +#include <sys/blist.h> +#include <sys/lock.h> #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif -#ifndef NPENDINGIO -#define NPENDINGIO 16 -#endif - -#define SWB_NPAGES MAX_PAGEOUT_CLUSTER +#define SWB_NPAGES MAX_PAGEOUT_CLUSTER +#include "opt_swap.h" #include <vm/vm.h> #include <vm/vm_prot.h> #include <vm/vm_object.h> @@ -77,848 +93,651 @@ #include <vm/vm_pageout.h> #include <vm/swap_pager.h> #include <vm/vm_extern.h> +#include <vm/vm_zone.h> -static int nswiodone; -int swap_pager_full; -extern int vm_swap_size; -static int no_swap_space = 1; -static int max_pageout_cluster; -struct rlisthdr swaplist; - -TAILQ_HEAD(swpclean, swpagerclean); - -typedef struct swpagerclean *swp_clean_t; +#define SWM_FREE 0x02 /* free, period */ +#define SWM_POP 0x04 /* pop out */ -static struct swpagerclean { - TAILQ_ENTRY(swpagerclean) spc_list; - int spc_flags; - struct buf *spc_bp; - vm_object_t spc_object; - vm_offset_t spc_kva; - int spc_first; - int spc_count; - vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; -} swcleanlist[NPENDINGIO]; - - -/* spc_flags values */ -#define SPC_ERROR 0x01 +/* + * vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks + * in the old system. + */ -#define SWB_EMPTY (-1) +extern int vm_swap_size; /* number of free swap blocks, in pages */ -/* list of completed page cleans */ -static struct swpclean swap_pager_done; +int swap_pager_full; /* swap space exhaustion (w/ hysteresis)*/ +static int nsw_rcount; /* free read buffers */ +static int nsw_wcount; /* free write buffers */ +static int nsw_hysteresis; /* hysteresis */ +static int max_pageout_cluster; /* maximum VOP I/O allowed */ +static int sw_alloc_interlock; /* swap pager allocation interlock */ -/* list of pending page cleans */ -static struct swpclean swap_pager_inuse; +struct blist *swapblist; +static struct swblock **swhash; +static int swhash_mask; -/* list of free pager clean structs */ -static struct swpclean swap_pager_free; -static int swap_pager_free_count; -static int swap_pager_free_pending; -/* list of "named" anon region objects */ -static struct pagerlst swap_pager_object_list; +/* + * "named" and "unnamed" anon region objects. Try to reduce the overhead + * of searching a named list by hashing it just a little. + */ -/* list of "unnamed" anon region objects */ -struct pagerlst swap_pager_un_object_list; +#define NOBJLISTS 8 -#define SWAP_FREE_NEEDED 0x1 /* need a swap block */ -#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2 -static int swap_pager_needflags; +#define NOBJLIST(handle) \ + (&swap_pager_object_list[((int)(long)handle >> 4) & (NOBJLISTS-1)]) -static struct pagerlst *swp_qs[] = { - &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0 -}; +static struct pagerlst swap_pager_object_list[NOBJLISTS]; +struct pagerlst swap_pager_un_object_list; +vm_zone_t swap_zone; /* - * pagerops for OBJT_SWAP - "swap pager". + * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure + * calls hooked from other parts of the VM system and do not appear here. + * (see vm/swap_pager.h). 
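
NOBJLIST() above spreads named anonymous objects over eight hash lists; shifting the handle right by 4 first keeps the low alignment bits of a pointer-valued handle from dumping everything on list 0. A quick stand-alone check of the index arithmetic (NOBJLIST_IDX is a local name for the macro's index expression):

#include <stdio.h>

#define NOBJLISTS	8
#define NOBJLIST_IDX(handle) \
	((((int)(long)(handle)) >> 4) & (NOBJLISTS - 1))

int
main(void)
{
	long h;

	/* Handles 16 bytes apart land on successive lists, then wrap. */
	for (h = 0x1000; h < 0x1000 + 16 * 10; h += 16)
		printf("handle 0x%lx -> list %d\n", h, NOBJLIST_IDX(h));
	return (0);
}
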
*/ + static vm_object_t swap_pager_alloc __P((void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset)); static void swap_pager_dealloc __P((vm_object_t object)); -static boolean_t - swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, - int *before, int *after)); static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static void swap_pager_init __P((void)); -static void spc_free __P((swp_clean_t)); +static void swap_pager_unswapped __P((vm_page_t)); struct pagerops swappagerops = { - swap_pager_init, - swap_pager_alloc, - swap_pager_dealloc, - swap_pager_getpages, - swap_pager_putpages, - swap_pager_haspage, - swap_pager_sync + swap_pager_init, /* early system initialization of pager */ + swap_pager_alloc, /* allocate an OBJT_SWAP object */ + swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ + swap_pager_getpages, /* pagein */ + swap_pager_putpages, /* pageout */ + swap_pager_haspage, /* get backing store status for page */ + swap_pager_unswapped /* remove swap related to page */ }; -static int npendingio; -static int dmmin; +/* + * dmmax is in page-sized chunks with the new swap system. It was + * dev-bsized chunks in the old. + * + * swap_*() routines are externally accessible. swp_*() routines are + * internal. + */ + int dmmax; +static int dmmax_mask; +int nswap_lowat = 128; /* in pages, swap_pager_full warning */ +int nswap_hiwat = 256; /* in pages, swap_pager_full warning */ + +static __inline void swp_sizecheck __P((void)); +static void swp_pager_sync_iodone __P((struct buf *bp)); +static void swp_pager_async_iodone __P((struct buf *bp)); + +/* + * Swap bitmap functions + */ + +static __inline void swp_pager_freeswapspace __P((daddr_t blk, int npages)); +static __inline daddr_t swp_pager_getswapspace __P((int npages)); + +/* + * Metadata functions + */ + +static void swp_pager_meta_build __P((vm_object_t, daddr_t, daddr_t, int)); +static void swp_pager_meta_free __P((vm_object_t, daddr_t, daddr_t)); +static void swp_pager_meta_free_all __P((vm_object_t)); +static daddr_t swp_pager_meta_ctl __P((vm_object_t, vm_pindex_t, int)); -static int swap_pager_block_index __P((vm_pindex_t pindex)); -static int swap_pager_block_offset __P((vm_pindex_t pindex)); -static daddr_t *swap_pager_diskaddr __P((vm_object_t object, - vm_pindex_t pindex, int *valid)); -static void swap_pager_finish __P((swp_clean_t spc)); -static void swap_pager_free_swap __P((vm_object_t object)); -static void swap_pager_freeswapspace __P((vm_object_t object, - unsigned int from, - unsigned int to)); -static int swap_pager_getswapspace __P((vm_object_t object, - unsigned int amount, - daddr_t *rtval)); -static void swap_pager_iodone __P((struct buf *)); -static void swap_pager_iodone1 __P((struct buf *bp)); -static void swap_pager_reclaim __P((void)); -static void swap_pager_ridpages __P((vm_page_t *m, int count, - int reqpage)); -static void swap_pager_setvalid __P((vm_object_t object, - vm_offset_t offset, int valid)); -static __inline void swapsizecheck __P((void)); - -#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE))) +/* + * SWP_SIZECHECK() - update swap_pager_full indication + * + * update the swap_pager_full indication and warn when we are + * about to run out of swap space. + * + * No restrictions on call + * This routine may not block. 
+ * This routine must be called at splvm() + */ static __inline void -swapsizecheck() +swp_sizecheck() { - if (vm_swap_size < 128 * btodb(PAGE_SIZE)) { + if (vm_swap_size < nswap_lowat) { if (swap_pager_full == 0) printf("swap_pager: out of swap space\n"); swap_pager_full = 1; - } else if (vm_swap_size > 192 * btodb(PAGE_SIZE)) + } else if (vm_swap_size > nswap_hiwat) { swap_pager_full = 0; + } } +/* + * SWAP_PAGER_INIT() - initialize the swap pager! + * + * Expected to be started from system init. NOTE: This code is run + * before much else so be careful what you depend on. Most of the VM + * system has yet to be initialized at this point. + */ + static void swap_pager_init() { - int maxsafepending; - TAILQ_INIT(&swap_pager_object_list); - TAILQ_INIT(&swap_pager_un_object_list); - /* - * Initialize clean lists + * Initialize object lists */ - TAILQ_INIT(&swap_pager_inuse); - TAILQ_INIT(&swap_pager_done); - TAILQ_INIT(&swap_pager_free); - swap_pager_free_count = 0; + int i; + + for (i = 0; i < NOBJLISTS; ++i) + TAILQ_INIT(&swap_pager_object_list[i]); + TAILQ_INIT(&swap_pager_un_object_list); /* - * Calculate the swap allocation constants. + * Device Stripe, in PAGE_SIZE'd blocks */ - dmmin = PAGE_SIZE / DEV_BSIZE; - dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2; - - maxsafepending = cnt.v_free_min - cnt.v_free_reserved; - npendingio = NPENDINGIO; - max_pageout_cluster = MAX_PAGEOUT_CLUSTER; - - if ((2 * NPENDINGIO * MAX_PAGEOUT_CLUSTER) > maxsafepending) { - max_pageout_cluster = MAX_PAGEOUT_CLUSTER / 2; - npendingio = maxsafepending / (2 * max_pageout_cluster); - if (npendingio < 2) - npendingio = 2; - } + + dmmax = SWB_NPAGES * 2; + dmmax_mask = ~(dmmax - 1); } +/* + * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process + * + * Expected to be started from pageout process once, prior to entering + * its main loop. + */ + void swap_pager_swap_init() { - swp_clean_t spc; - struct buf *bp; - int i; + int n; /* - * kva's are allocated here so that we dont need to keep doing - * kmem_alloc pageables at runtime + * Number of in-transit swap bp operations. Don't + * exhaust the pbufs completely. Make sure we + * initialize workable values (0 will work for hysteresis + * but it isn't very efficient). + * + * The max_pageout_cluster is constrained by the bp->b_pages[] + * array (MAXPHYS/PAGE_SIZE) and our locally defined + * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are + * constrained by the swap device interleave stripe size. 
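
swp_sizecheck() above is a small hysteresis latch: swap_pager_full sets below nswap_lowat but clears only above nswap_hiwat, so the console warning cannot flap while free swap hovers near one threshold. A compact trace of that behavior with the default watermarks:

#include <stdio.h>

static int nswap_lowat = 128;	/* defaults from the diff, in pages */
static int nswap_hiwat = 256;
static int swap_pager_full;

static void
sizecheck_demo(int vm_swap_size)
{
	if (vm_swap_size < nswap_lowat)
		swap_pager_full = 1;
	else if (vm_swap_size > nswap_hiwat)
		swap_pager_full = 0;
	/* between the marks the flag keeps its old value */
}

int
main(void)
{
	static int sizes[] = { 300, 100, 200, 200, 257 };
	int i;

	for (i = 0; i < 5; i++) {
		sizecheck_demo(sizes[i]);
		printf("free %3d pages -> full=%d\n", sizes[i],
		    swap_pager_full);	/* 0, 1, 1, 1, 0 */
	}
	return (0);
}
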
*/ - for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) { - spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * max_pageout_cluster); - if (!spc->spc_kva) { - break; - } - spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL); - if (!spc->spc_bp) { - kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); - break; - } - spc->spc_flags = 0; - TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); - swap_pager_free_count++; - } -} -int -swap_pager_swp_alloc(object, wait) - vm_object_t object; - int wait; -{ - sw_blk_t swb; - int nblocks; - int i, j; - - nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES; - swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait); - if (swb == NULL) - return 1; - - for (i = 0; i < nblocks; i++) { - swb[i].swb_valid = 0; - swb[i].swb_locked = 0; - for (j = 0; j < SWB_NPAGES; j++) - swb[i].swb_block[j] = SWB_EMPTY; - } + nsw_rcount = (nswbuf + 1) / 2; + nsw_wcount = (nswbuf + 3) / 4; + nsw_hysteresis = nsw_wcount / 2; + max_pageout_cluster = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); - object->un_pager.swp.swp_nblocks = nblocks; - object->un_pager.swp.swp_allocsize = 0; - object->un_pager.swp.swp_blocks = swb; - object->un_pager.swp.swp_poip = 0; + /* + * Initialize our zone. Right now I'm just guessing on the number + * we need based on the number of pages in the system. Each swblock + * can hold 16 pages, so this is probably overkill. + */ - if (object->handle != NULL) { - TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list); - } else { - TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); - } + n = cnt.v_page_count * 2; - return 0; + swap_zone = zinit( + "SWAPMETA", + sizeof(struct swblock), + n, + ZONE_INTERRUPT, + 1 + ); + + /* + * Initialize our meta-data hash table. The swapper does not need to + * be quite as efficient as the VM system, so we do not use an + * oversized hash table. + * + * n: size of hash table, must be power of 2 + * swhash_mask: hash table index mask + */ + + for (n = 1; n < cnt.v_page_count / 4; n <<= 1) + ; + + swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK); + bzero(swhash, sizeof(struct swblock *) * n); + + swhash_mask = n - 1; } /* - * Allocate an object and associated resources. - * Note that if we are called from the pageout daemon (handle == NULL) - * we should not wait for memory as it could resulting in deadlock. + * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate + * its metadata structures. + * + * This routine is called from the mmap and fork code to create a new + * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object + * and then converting it with swp_pager_meta_build(). + * + * This routine may block in vm_object_allocate() and create a named + * object lookup race, so we must interlock. We must also run at + * splvm() for the object lookup to handle races with interrupts, but + * we do not have to maintain splvm() in between the lookup and the + * add because (I believe) it is not possible to attempt to create + * a new swap object w/handle when a default object with that handle + * already exists. */ + static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; - /* - * If this is a "named" anonymous region, look it up and use the - * object if it exists, otherwise allocate a new one. - */ if (handle) { - object = vm_pager_object_lookup(&swap_pager_object_list, handle); + /* + * Reference existing named region or allocate new one. 
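
The hash-table sizing loop above just finds the smallest power of two that is at least v_page_count/4, which is exactly what makes the n - 1 index mask (swhash_mask) valid. For example, on a hypothetical machine with 100000 pages:

#include <stdio.h>

int
main(void)
{
	int v_page_count = 100000;	/* example value */
	int n;

	/* Same loop as swap_pager_swap_init(). */
	for (n = 1; n < v_page_count / 4; n <<= 1)
		;
	printf("v_page_count=%d -> swhash size=%d, mask=0x%x\n",
	    v_page_count, n, n - 1);	/* size 32768, mask 0x7fff */
	return (0);
}
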
There + * should not be a race here against swp_pager_meta_build() + * as called from vm_page_remove() in regards to the lookup + * of the handle. + */ + + while (sw_alloc_interlock) { + sw_alloc_interlock = -1; + tsleep(&sw_alloc_interlock, PVM, "swpalc", 0); + } + sw_alloc_interlock = 1; + + object = vm_pager_object_lookup(NOBJLIST(handle), handle); + if (object != NULL) { vm_object_reference(object); } else { - /* - * XXX - there is a race condition here. Two processes - * can request the same named object simultaneuously, - * and if one blocks for memory, the result is a disaster. - * Probably quite rare, but is yet another reason to just - * rip support of "named anonymous regions" out altogether. - */ - object = vm_object_allocate(OBJT_SWAP, + object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(offset + PAGE_MASK + size)); object->handle = handle; - (void) swap_pager_swp_alloc(object, M_WAITOK); + + swp_pager_meta_build( + object, + 0, + SWAPBLK_NONE, + 0 + ); } + + if (sw_alloc_interlock < 0) + wakeup(&sw_alloc_interlock); + + sw_alloc_interlock = 0; } else { - object = vm_object_allocate(OBJT_SWAP, + object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(offset + PAGE_MASK + size)); - (void) swap_pager_swp_alloc(object, M_WAITOK); + + swp_pager_meta_build( + object, + 0, + SWAPBLK_NONE, + 0 + ); } return (object); } /* - * returns disk block associated with pager and offset - * additionally, as a side effect returns a flag indicating - * if the block has been written + * SWAP_PAGER_DEALLOC() - remove swap metadata from object + * + * The swap backing for the object is destroyed. The code is + * designed such that we can reinstantiate it later, but this + * routine is typically called only when the entire object is + * about to be destroyed. + * + * This routine may block, but no longer does. + * + * The object must be locked or unreferenceable. */ -static __inline daddr_t * -swap_pager_diskaddr(object, pindex, valid) +static void +swap_pager_dealloc(object) vm_object_t object; - vm_pindex_t pindex; - int *valid; { - register sw_blk_t swb; - int ix; - - if (valid) - *valid = 0; - ix = pindex / SWB_NPAGES; - if ((ix >= object->un_pager.swp.swp_nblocks) || - (pindex >= object->size)) { - return (FALSE); + /* + * Remove from list right away so lookups will fail if we block for + * pageout completion. + */ + + if (object->handle == NULL) { + TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); + } else { + TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); } - swb = &object->un_pager.swp.swp_blocks[ix]; - ix = pindex % SWB_NPAGES; - if (valid) - *valid = swb->swb_valid & (1 << ix); - return &swb->swb_block[ix]; -} -/* - * Utility routine to set the valid (written) bit for - * a block associated with a pager and offset - */ -static void -swap_pager_setvalid(object, offset, valid) - vm_object_t object; - vm_offset_t offset; - int valid; -{ - register sw_blk_t swb; - int ix; + vm_object_pip_wait(object, "swpdea"); - ix = offset / SWB_NPAGES; - if (ix >= object->un_pager.swp.swp_nblocks) - return; + /* + * Free all remaining metadata. We only bother to free it from + * the swap meta data. We do not attempt to free swapblk's still + * associated with vm_page_t's for this object. We do not care + * if paging is still in progress on some objects. 
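
The sw_alloc_interlock dance in swap_pager_alloc() above is a sleep lock built from a flag plus tsleep()/wakeup(): the value -1 records that a sleeper exists, so the holder knows a wakeup is owed on release. A user-space analogue of just that flag protocol, using pthreads in place of tsleep/wakeup (the kernel version needs no mutex; this is only a model):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int interlock;		/* 0 free, 1 held, -1 held with waiter */

static void
acquire(void)
{
	pthread_mutex_lock(&lk);
	while (interlock != 0) {
		interlock = -1;			/* note that we are waiting */
		pthread_cond_wait(&cv, &lk);	/* the tsleep() equivalent */
	}
	interlock = 1;
	pthread_mutex_unlock(&lk);
}

static void
release(void)
{
	pthread_mutex_lock(&lk);
	if (interlock < 0)
		pthread_cond_broadcast(&cv);	/* the wakeup() equivalent */
	interlock = 0;
	pthread_mutex_unlock(&lk);
}

static void *
worker(void *arg)
{
	acquire();
	printf("thread %ld holds the interlock\n", (long)arg);
	release();
	return (NULL);
}

int
main(void)
{
	pthread_t t[3];
	long i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return (0);
}
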
+ */ - swb = &object->un_pager.swp.swp_blocks[ix]; - ix = offset % SWB_NPAGES; - if (valid) - swb->swb_valid |= (1 << ix); - else - swb->swb_valid &= ~(1 << ix); - return; + swp_pager_meta_free_all(object); } +/************************************************************************ + * SWAP PAGER BITMAP ROUTINES * + ************************************************************************/ + /* - * this routine allocates swap space with a fragmentation - * minimization policy. + * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space + * + * Allocate swap for the requested number of pages. The starting + * swap block number (a page index) is returned or SWAPBLK_NONE + * if the allocation failed. + * + * Also has the side effect of advising that somebody made a mistake + * when they configured swap and didn't configure enough. + * + * Must be called at splvm() to avoid races with bitmap frees from + * vm_page_remove() aka swap_pager_page_removed(). + * + * This routine may not block + * This routine must be called at splvm(). */ -static int -swap_pager_getswapspace(object, amount, rtval) - vm_object_t object; - unsigned int amount; - daddr_t *rtval; + +static __inline daddr_t +swp_pager_getswapspace(npages) + int npages; { - unsigned location; + daddr_t blk; - vm_swap_size -= amount; - - if (!rlist_alloc(&swaplist, amount, &location)) { - vm_swap_size += amount; - return 0; + if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) { + printf("swap_pager_getswapspace: failed\n"); } else { - swapsizecheck(); - object->un_pager.swp.swp_allocsize += amount; - *rtval = location; - return 1; + vm_swap_size -= npages; + swp_sizecheck(); } + return(blk); } /* - * this routine frees swap space with a fragmentation - * minimization policy. + * SWP_PAGER_FREESWAPSPACE() - free raw swap space + * + * This routine returns the specified swap blocks back to the bitmap. + * + * Note: This routine may not block (it could in the old swap code), + * and through the use of the new blist routines it does not block. + * + * We must be called at splvm() to avoid races with bitmap frees from + * vm_page_remove() aka swap_pager_page_removed(). + * + * This routine may not block + * This routine must be called at splvm(). */ -static void -swap_pager_freeswapspace(object, from, to) - vm_object_t object; - unsigned int from; - unsigned int to; + +static __inline void +swp_pager_freeswapspace(blk, npages) + daddr_t blk; + int npages; { - rlist_free(&swaplist, from, to); - vm_swap_size += (to - from) + 1; - object->un_pager.swp.swp_allocsize -= (to - from) + 1; - swapsizecheck(); + blist_free(swapblist, blk, npages); + vm_swap_size += npages; + swp_sizecheck(); } + /* - * this routine frees swap blocks from a specified pager + * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page + * range within an object. + * + * This is a globally accessible routine. + * + * This routine removes swapblk assignments from swap metadata. + * + * The external callers of this routine typically have already destroyed + * or renamed vm_page_t's associated with this range in the object so + * we should be ok. 
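
swp_pager_getswapspace() and swp_pager_freeswapspace() above keep vm_swap_size (now counted in pages) in lockstep with the blist allocator and re-run the hysteresis check on every transition. The book-keeping in isolation, with the radix-bitmap blist replaced by a trivial bump allocator (demo only; a real blist_alloc() hands back arbitrary runs):

#include <stdio.h>

#define SWAPBLK_NONE	(-1)		/* demo sentinel */

static int vm_swap_size = 8;		/* free swap, in pages */
static int next_blk = 100;		/* stands in for the blist */

static int
demo_getswapspace(int npages)
{
	int blk;

	if (vm_swap_size < npages)
		return (SWAPBLK_NONE);	/* the blist_alloc() failure case */
	blk = next_blk;
	next_blk += npages;
	vm_swap_size -= npages;		/* same accounting as the diff */
	return (blk);
}

static void
demo_freeswapspace(int blk, int npages)
{
	(void)blk;			/* real code: blist_free(blk, n) */
	vm_swap_size += npages;
}

int
main(void)
{
	int blk = demo_getswapspace(4);

	printf("allocated blk %d, %d pages free\n", blk, vm_swap_size);
	demo_freeswapspace(blk, 4);
	printf("released, %d pages free\n", vm_swap_size);
	return (0);
}
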
*/ + void swap_pager_freespace(object, start, size) vm_object_t object; vm_pindex_t start; vm_size_t size; { - vm_pindex_t i; - int s; - - s = splvm(); - for (i = start; i < start + size; i += 1) { - int valid; - daddr_t *addr = swap_pager_diskaddr(object, i, &valid); - - if (addr && *addr != SWB_EMPTY) { - swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1); - if (valid) { - swap_pager_setvalid(object, i, 0); - } - *addr = SWB_EMPTY; - } - } - splx(s); + swp_pager_meta_free(object, start, size); } /* - * same as freespace, but don't free, just force a DMZ next time - */ -void -swap_pager_dmzspace(object, start, size) - vm_object_t object; - vm_pindex_t start; - vm_size_t size; -{ - vm_pindex_t i; - int s; - - s = splvm(); - for (i = start; i < start + size; i += 1) { - int valid; - daddr_t *addr = swap_pager_diskaddr(object, i, &valid); - - if (addr && *addr != SWB_EMPTY) { - if (valid) { - swap_pager_setvalid(object, i, 0); - } - } - } - splx(s); -} - -static void -swap_pager_free_swap(object) - vm_object_t object; -{ - register int i, j; - register sw_blk_t swb; - int first_block=0, block_count=0; - int s; - /* - * Free left over swap blocks - */ - swb = object->un_pager.swp.swp_blocks; - if (swb == NULL) { - return; - } - - s = splvm(); - for (i = 0; i < object->un_pager.swp.swp_nblocks; i++, swb++) { - for (j = 0; j < SWB_NPAGES; j++) { - if (swb->swb_block[j] != SWB_EMPTY) { - /* - * initially the length of the run is zero - */ - if (block_count == 0) { - first_block = swb->swb_block[j]; - block_count = btodb(PAGE_SIZE); - swb->swb_block[j] = SWB_EMPTY; - /* - * if the new block can be included into the current run - */ - } else if (swb->swb_block[j] == first_block + block_count) { - block_count += btodb(PAGE_SIZE); - swb->swb_block[j] = SWB_EMPTY; - /* - * terminate the previous run, and start a new one - */ - } else { - swap_pager_freeswapspace(object, first_block, - (unsigned) first_block + block_count - 1); - first_block = swb->swb_block[j]; - block_count = btodb(PAGE_SIZE); - swb->swb_block[j] = SWB_EMPTY; - } - } - } - } - - if (block_count) { - swap_pager_freeswapspace(object, first_block, - (unsigned) first_block + block_count - 1); - } - splx(s); -} - - -/* - * swap_pager_reclaim frees up over-allocated space from all pagers - * this eliminates internal fragmentation due to allocation of space - * for segments that are never swapped to. It has been written so that - * it does not block until the rlist_free operation occurs; it keeps - * the queues consistant. 
- */ - -/* - * Maximum number of blocks (pages) to reclaim per pass - */ -#define MAXRECLAIM 128 - -static void -swap_pager_reclaim() -{ - vm_object_t object; - int i, j, k; - int s; - int reclaimcount; - static struct { - int address; - vm_object_t object; - } reclaims[MAXRECLAIM]; - static int in_reclaim; - - /* - * allow only one process to be in the swap_pager_reclaim subroutine - */ - s = splvm(); - if (in_reclaim) { - tsleep(&in_reclaim, PSWP, "swrclm", 0); - splx(s); - return; - } - in_reclaim = 1; - reclaimcount = 0; - - /* for each pager queue */ - for (k = 0; swp_qs[k]; k++) { - - object = TAILQ_FIRST(swp_qs[k]); - while (object && (reclaimcount < MAXRECLAIM)) { - - /* - * see if any blocks associated with a pager has been - * allocated but not used (written) - */ - if ((object->flags & OBJ_DEAD) == 0 && - (object->paging_in_progress == 0)) { - for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) { - sw_blk_t swb = &object->un_pager.swp.swp_blocks[i]; - - if (swb->swb_locked) - continue; - for (j = 0; j < SWB_NPAGES; j++) { - if (swb->swb_block[j] != SWB_EMPTY && - (swb->swb_valid & (1 << j)) == 0) { - reclaims[reclaimcount].address = swb->swb_block[j]; - reclaims[reclaimcount++].object = object; - swb->swb_block[j] = SWB_EMPTY; - if (reclaimcount >= MAXRECLAIM) - goto rfinished; - } - } - } - } - object = TAILQ_NEXT(object, pager_object_list); - } - } - -rfinished: - - /* - * free the blocks that have been added to the reclaim list - */ - for (i = 0; i < reclaimcount; i++) { - swap_pager_freeswapspace(reclaims[i].object, - reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1); - } - splx(s); - in_reclaim = 0; - wakeup(&in_reclaim); -} - - -/* - * swap_pager_copy copies blocks from one pager to another and - * destroys the source pager + * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager + * and destroy the source. + * + * Copy any valid swapblks from the source to the destination. In + * cases where both the source and destination have a valid swapblk, + * we keep the destination's. + * + * This routine is allowed to block. It may block allocating metadata + * indirectly through swp_pager_meta_build() or if paging is still in + * progress on the source. + * + * XXX vm_page_collapse() kinda expects us not to block because we + * supposedly do not need to allocate memory, but for the moment we + * *may* have to get a little memory from the zone allocator, but + * it is taken from the interrupt memory. We should be ok. + * + * The source object contains no vm_page_t's (which is just as well) + * + * The source object is of type OBJT_SWAP. + * + * The source and destination objects must be + * locked or inaccessible (XXX are they ???) */ void -swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset, - offset, destroysource) +swap_pager_copy(srcobject, dstobject, offset, destroysource) vm_object_t srcobject; - vm_pindex_t srcoffset; vm_object_t dstobject; - vm_pindex_t dstoffset; vm_pindex_t offset; int destroysource; { vm_pindex_t i; - int origsize; - int s; - - if (vm_swap_size) - no_swap_space = 0; - - origsize = srcobject->un_pager.swp.swp_allocsize; /* - * remove the source object from the swap_pager internal queue + * If destroysource is set, we remove the source object from the + * swap_pager internal queue now. 
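
The rewritten copy loop that follows makes one decision per destination page: keep the destination's block when it has one, otherwise pop the source's block across, and silently free any surplus source block. The core of that decision modeled over two small arrays (SWAPBLK_NONE reduced to -1 here; SWM_POP/SWM_FREE become plain moves and clears):

#include <stdio.h>

#define SWAPBLK_NONE	(-1)

static long src[4] = { 500, 501, SWAPBLK_NONE, 503 };
static long dst[4] = { SWAPBLK_NONE, 900, SWAPBLK_NONE, SWAPBLK_NONE };

int
main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		if (dst[i] == SWAPBLK_NONE) {
			if (src[i] != SWAPBLK_NONE) {
				dst[i] = src[i];	/* SWM_POP: move over */
				src[i] = SWAPBLK_NONE;
			}
		} else if (src[i] != SWAPBLK_NONE) {
			src[i] = SWAPBLK_NONE;	/* SWM_FREE: drop surplus */
		}
		printf("page %d: dst=%ld src=%ld\n", i, dst[i], src[i]);
	}
	return (0);
}
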
*/ + if (destroysource) { if (srcobject->handle == NULL) { - TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list); + TAILQ_REMOVE( + &swap_pager_un_object_list, + srcobject, + pager_object_list + ); } else { - TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list); + TAILQ_REMOVE( + NOBJLIST(srcobject->handle), + srcobject, + pager_object_list + ); } } - s = splvm(); - while (srcobject->un_pager.swp.swp_poip) { - tsleep(srcobject, PVM, "spgout", 0); - } - /* - * clean all of the pages that are currently active and finished + * transfer source to destination. */ - if (swap_pager_free_pending) - swap_pager_sync(); - /* - * transfer source to destination - */ - for (i = 0; i < dstobject->size; i += 1) { - int srcvalid, dstvalid; - daddr_t *srcaddrp = swap_pager_diskaddr(srcobject, - i + offset + srcoffset, &srcvalid); - daddr_t *dstaddrp; + for (i = 0; i < dstobject->size; ++i) { + daddr_t dstaddr; /* - * see if the source has space allocated + * Locate (without changing) the swapblk on the destination, + * unless it is invalid in which case free it silently, or + * if the destination is a resident page, in which case the + * source is thrown away. */ - if (srcaddrp && *srcaddrp != SWB_EMPTY) { + + dstaddr = swp_pager_meta_ctl(dstobject, i, 0); + + if (dstaddr == SWAPBLK_NONE) { /* - * if the source is valid and the dest has no space, - * then copy the allocation from the srouce to the - * dest. + * Destination has no swapblk and is not resident, + * copy source. */ - if (srcvalid) { - dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset, - &dstvalid); - /* - * if the dest already has a valid block, - * deallocate the source block without - * copying. - */ - if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { - swap_pager_freeswapspace(dstobject, *dstaddrp, - *dstaddrp + btodb(PAGE_SIZE) - 1); - *dstaddrp = SWB_EMPTY; - } - if (dstaddrp && *dstaddrp == SWB_EMPTY) { - *dstaddrp = *srcaddrp; - *srcaddrp = SWB_EMPTY; - dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE); - srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE); - swap_pager_setvalid(dstobject, i + dstoffset, 1); - } - } + daddr_t srcaddr; + + srcaddr = swp_pager_meta_ctl( + srcobject, + i + offset, + SWM_POP + ); + + if (srcaddr != SWAPBLK_NONE) + swp_pager_meta_build(dstobject, i, srcaddr, 1); + } else { /* - * if the source is not empty at this point, then - * deallocate the space. + * Destination has valid swapblk or it is represented + * by a resident page. We destroy the source block. */ - if (*srcaddrp != SWB_EMPTY) { - swap_pager_freeswapspace(srcobject, *srcaddrp, - *srcaddrp + btodb(PAGE_SIZE) - 1); - *srcaddrp = SWB_EMPTY; - } + + swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } - splx(s); /* - * Free left over swap blocks + * Free left over swap blocks in source. + * + * We have to revert the type to OBJT_DEFAULT so we do not accidentally + * double-remove the object from the swap queues. 
*/ - if (destroysource) { - swap_pager_free_swap(srcobject); - if (srcobject->un_pager.swp.swp_allocsize) { - printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n", - srcobject->un_pager.swp.swp_allocsize, origsize); - } - - free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA); - srcobject->un_pager.swp.swp_blocks = NULL; + if (destroysource) { + swp_pager_meta_free_all(srcobject); + /* + * Reverting the type is not necessary, the caller is going + * to destroy srcobject directly, but I'm doing it here + * for consistancy since we've removed the object from its + * queues. + */ + srcobject->type = OBJT_DEFAULT; } return; } -static void -swap_pager_dealloc(object) +/* + * SWAP_PAGER_HASPAGE() - determine if we have good backing store for + * the requested page. + * + * We determine whether good backing store exists for the requested + * page and return TRUE if it does, FALSE if it doesn't. + * + * If TRUE, we also try to determine how much valid, contiguous backing + * store exists before and after the requested page within a reasonable + * distance. We do not try to restrict it to the swap device stripe + * (that is handled in getpages/putpages). It probably isn't worth + * doing here. + */ + +boolean_t +swap_pager_haspage(object, pindex, before, after) vm_object_t object; + vm_pindex_t pindex; + int *before; + int *after; { - int s; - sw_blk_t swb; + daddr_t blk0; /* - * Remove from list right away so lookups will fail if we block for - * pageout completion. + * do we have good backing store at the requested index ? */ - if (object->handle == NULL) { - TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); - } else { - TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list); - } - /* - * Wait for all pageouts to finish and remove all entries from - * cleaning list. - */ + blk0 = swp_pager_meta_ctl(object, pindex, 0); - s = splvm(); - while (object->un_pager.swp.swp_poip) { - tsleep(object, PVM, "swpout", 0); + if (blk0 & SWAPBLK_NONE) { + if (before) + *before = 0; + if (after) + *after = 0; + return (FALSE); } - splx(s); - - if (swap_pager_free_pending) - swap_pager_sync(); /* - * Free left over swap blocks + * find backwards-looking contiguous good backing store */ - swap_pager_free_swap(object); - if (object->un_pager.swp.swp_allocsize) { - printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n", - object->un_pager.swp.swp_allocsize); - } - swb = object->un_pager.swp.swp_blocks; - if (swb) { - /* - * Free swap management resources - */ - free(swb, M_VMPGDATA); - object->un_pager.swp.swp_blocks = NULL; - } -} + if (before != NULL) { + int i; -static __inline int -swap_pager_block_index(pindex) - vm_pindex_t pindex; -{ - return (pindex / SWB_NPAGES); -} - -static __inline int -swap_pager_block_offset(pindex) - vm_pindex_t pindex; -{ - return (pindex % SWB_NPAGES); -} + for (i = 1; i < (SWB_NPAGES/2); ++i) { + daddr_t blk; -/* - * swap_pager_haspage returns TRUE if the pager has data that has - * been written out. 
- */ -static boolean_t -swap_pager_haspage(object, pindex, before, after) - vm_object_t object; - vm_pindex_t pindex; - int *before; - int *after; -{ - register sw_blk_t swb; - int ix; - - if (before != NULL) - *before = 0; - if (after != NULL) - *after = 0; - ix = pindex / SWB_NPAGES; - if (ix >= object->un_pager.swp.swp_nblocks) { - return (FALSE); + if (i > pindex) + break; + blk = swp_pager_meta_ctl(object, pindex - i, 0); + if (blk & SWAPBLK_NONE) + break; + if (blk != blk0 - i) + break; + } + *before = (i - 1); } - swb = &object->un_pager.swp.swp_blocks[ix]; - ix = pindex % SWB_NPAGES; - - if (swb->swb_block[ix] != SWB_EMPTY) { - - if (swb->swb_valid & (1 << ix)) { - int tix; - if (before) { - for(tix = ix - 1; tix >= 0; --tix) { - if ((swb->swb_valid & (1 << tix)) == 0) - break; - if ((swb->swb_block[tix] + - (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) != - swb->swb_block[ix]) - break; - (*before)++; - } - } - if (after) { - for(tix = ix + 1; tix < SWB_NPAGES; tix++) { - if ((swb->swb_valid & (1 << tix)) == 0) - break; - if ((swb->swb_block[tix] - - (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) != - swb->swb_block[ix]) - break; - (*after)++; - } - } + /* + * find forward-looking contiguous good backing store + */ - return TRUE; + if (after != NULL) { + int i; + + for (i = 1; i < (SWB_NPAGES/2); ++i) { + daddr_t blk; + + blk = swp_pager_meta_ctl(object, pindex + i, 0); + if (blk & SWAPBLK_NONE) + break; + if (blk != blk0 + i) + break; } + *after = (i - 1); } - return (FALSE); -} -/* - * Wakeup based upon spc state - */ -static void -spc_wakeup(void) -{ - if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) { - swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT; - wakeup(&swap_pager_needflags); - } else if ((swap_pager_needflags & SWAP_FREE_NEEDED) && - swap_pager_free_count >= ((2 * npendingio) / 3)) { - swap_pager_needflags &= ~SWAP_FREE_NEEDED; - wakeup(&swap_pager_free); - } + return (TRUE); } /* - * Free an spc structure + * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page + * + * This removes any associated swap backing store, whether valid or + * not, from the page. + * + * This routine is typically called when a page is made dirty, at + * which point any associated swap can be freed. MADV_FREE also + * calls us in a special-case situation + * + * NOTE!!! If the page is clean and the swap was valid, the caller + * should make the page dirty before calling this routine. This routine + * does NOT change the m->dirty status of the page. Also: MADV_FREE + * depends on it. + * + * This routine may not block */ -static void -spc_free(spc) - swp_clean_t spc; -{ - spc->spc_flags = 0; - TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); - swap_pager_free_count++; - if (swap_pager_needflags) { - spc_wakeup(); - } -} -/* - * swap_pager_ridpages is a convienience routine that deallocates all - * but the required page. this is usually used in error returns that - * need to invalidate the "extra" readahead pages. - */ static void -swap_pager_ridpages(m, count, reqpage) - vm_page_t *m; - int count; - int reqpage; +swap_pager_unswapped(m) + vm_page_t m; { - int i; - - for (i = 0; i < count; i++) { - if (i != reqpage) { - vm_page_free(m[i]); - } - } + swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* - * swap_pager_iodone1 is the completion routine for both reads and async writes + * SWAP_PAGER_GETPAGES() - bring pages in from swap + * + * Attempt to retrieve (m, count) pages from backing store, but make + * sure we retrieve at least m[reqpage]. 
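
The before/after scans in the new swap_pager_haspage() above simply count how far the swap block numbers stay exactly contiguous with blk0 on either side of the requested page. A stand-alone rendering over synthetic metadata (meta_ctl_demo plays the role of swp_pager_meta_ctl()):

#include <stdio.h>

#define SWB_NPAGES	16
#define SWAPBLK_NONE	(-1L)

/* Synthetic swap blocks for pindex 0..9; -1 means no backing store. */
static long blks[10] = { 100, 101, 102, -1, 200, 201, 202, 203, 204, 300 };

static long
meta_ctl_demo(int pindex)
{
	return ((pindex >= 0 && pindex < 10) ? blks[pindex] : SWAPBLK_NONE);
}

int
main(void)
{
	int pindex = 6;
	long blk0 = meta_ctl_demo(pindex);
	int i, before, after;

	for (i = 1; i < SWB_NPAGES / 2; ++i)	/* backwards scan */
		if (i > pindex || meta_ctl_demo(pindex - i) != blk0 - i)
			break;
	before = i - 1;
	for (i = 1; i < SWB_NPAGES / 2; ++i)	/* forwards scan */
		if (meta_ctl_demo(pindex + i) != blk0 + i)
			break;
	after = i - 1;
	printf("pindex %d: blk %ld, before=%d after=%d\n",
	    pindex, blk0, before, after);	/* blk 202, 2 and 2 */
	return (0);
}
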
We try to load in as large + * a chunk surrounding m[reqpage] as is contiguous in swap and which + * belongs to the same object. + * + * The code is designed for asynchronous operation and + * immediate-notification of 'reqpage' but tends not to be + * used that way. Please do not optimize-out this algorithmic + * feature, I intend to improve on it in the future. + * + * The parent has a single vm_object_pip_add() reference prior to + * calling us and we should return with the same. + * + * The parent has BUSY'd the pages. We should return with 'm' + * left busy, but the others adjusted. */ -static void -swap_pager_iodone1(bp) - struct buf *bp; -{ - bp->b_flags |= B_DONE; - bp->b_flags &= ~B_ASYNC; - wakeup(bp); -} static int swap_pager_getpages(object, m, count, reqpage) @@ -926,208 +745,235 @@ swap_pager_getpages(object, m, count, reqpage) vm_page_t *m; int count, reqpage; { - register struct buf *bp; - sw_blk_t swb[count]; - register int s; + struct buf *bp; + vm_page_t mreq; + int s; int i; - boolean_t rv; - vm_offset_t kva, off[count]; - vm_pindex_t paging_offset; - int reqaddr[count]; - int sequential; - - int first, last; - int failed; - int reqdskregion; - - object = m[reqpage]->object; - paging_offset = OFF_TO_IDX(object->paging_offset); - sequential = (m[reqpage]->pindex == (object->last_read + 1)); - - for (i = 0; i < count; i++) { - vm_pindex_t fidx = m[i]->pindex + paging_offset; - int ix = swap_pager_block_index(fidx); - - if (ix >= object->un_pager.swp.swp_nblocks) { - int j; - - if (i <= reqpage) { - swap_pager_ridpages(m, count, reqpage); - return (VM_PAGER_FAIL); - } - for (j = i; j < count; j++) { - vm_page_free(m[j]); - } - count = i; + int j; + daddr_t blk; + vm_offset_t kva; + vm_pindex_t lastpindex; + + mreq = m[reqpage]; + +#if !defined(MAX_PERF) + if (mreq->object != object) { + panic("swap_pager_getpages: object mismatch %p/%p", + object, + mreq->object + ); + } +#endif + /* + * Calculate range to retrieve. The pages have already been assigned + * their swapblks. We require a *contiguous* range that falls entirely + * within a single device stripe. If we do not supply it, bad things + * happen. + */ + + + blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); + + for (i = reqpage - 1; i >= 0; --i) { + daddr_t iblk; + + iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); + if (iblk & SWAPBLK_NONE) + break; + + if ((blk ^ iblk) & dmmax_mask) + break; + + if (blk != iblk + (reqpage - i)) break; - } - swb[i] = &object->un_pager.swp.swp_blocks[ix]; - off[i] = swap_pager_block_offset(fidx); - reqaddr[i] = swb[i]->swb_block[off[i]]; } + ++i; - /* make sure that our required input request is existant */ + for (j = reqpage + 1; j < count; ++j) { + daddr_t jblk; - if (reqaddr[reqpage] == SWB_EMPTY || - (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { - swap_pager_ridpages(m, count, reqpage); - return (VM_PAGER_FAIL); + jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); + if (jblk & SWAPBLK_NONE) + break; + + if ((blk ^ jblk) & dmmax_mask) + break; + + if (blk != jblk - (j - reqpage)) + break; } - reqdskregion = reqaddr[reqpage] / dmmax; /* - * search backwards for the first contiguous page to transfer + * If blk itself is bad, well, we can't do any I/O. This should + * already be covered as a side effect, but I'm making sure. 
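
The (blk ^ iblk) & dmmax_mask tests above are how the cluster is kept inside one swap-device stripe: two page-sized block numbers lie in the same dmmax-page stripe exactly when they agree in every bit above the stripe size. For instance, with the dmmax of 32 set up in swap_pager_init():

#include <stdio.h>

#define SWB_NPAGES	16

int
main(void)
{
	int dmmax = SWB_NPAGES * 2;	/* 32-page stripe, as in the diff */
	int dmmax_mask = ~(dmmax - 1);
	int blk = 30, iblk;

	for (iblk = 28; iblk <= 34; iblk++)
		printf("blk %d vs %d: %s stripe\n", blk, iblk,
		    ((blk ^ iblk) & dmmax_mask) ? "different" : "same");
	return (0);	/* 28..31 same, 32..34 different */
}
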
*/ - failed = 0; - first = 0; - for (i = reqpage - 1; i >= 0; --i) { - if (sequential || failed || (reqaddr[i] == SWB_EMPTY) || - (swb[i]->swb_valid & (1 << off[i])) == 0 || - (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || - ((reqaddr[i] / dmmax) != reqdskregion)) { - failed = 1; - vm_page_free(m[i]); - if (first == 0) - first = i + 1; - } + + if (blk & SWAPBLK_NONE) { + i = reqpage; + j = reqpage + 1; } + /* - * search forwards for the last contiguous page to transfer + * free pages outside our collection range. Note: we never free + * mreq, it must remain busy throughout. */ - failed = 0; - last = count; - for (i = reqpage + 1; i < count; i++) { - if (failed || (reqaddr[i] == SWB_EMPTY) || - (swb[i]->swb_valid & (1 << off[i])) == 0 || - (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || - ((reqaddr[i] / dmmax) != reqdskregion)) { - failed = 1; - vm_page_free(m[i]); - if (last == count) - last = i; - } - } - count = last; - if (first != 0) { - for (i = first; i < count; i++) { - m[i - first] = m[i]; - reqaddr[i - first] = reqaddr[i]; - off[i - first] = off[i]; + { + int k; + + for (k = 0; k < i; ++k) { + vm_page_free(m[k]); + } + for (k = j; k < count; ++k) { + vm_page_free(m[k]); } - count -= first; - reqpage -= first; } - ++swb[reqpage]->swb_locked; /* - * at this point: "m" is a pointer to the array of vm_page_t for - * paging I/O "count" is the number of vm_page_t entries represented - * by "m" "object" is the vm_object_t for I/O "reqpage" is the index - * into "m" for the page actually faulted + * Return VM_PAGER_FAIL if we have nothing + * to do. Return mreq still busy, but the + * others unbusied. */ + if (blk & SWAPBLK_NONE) + return(VM_PAGER_FAIL); + + /* * Get a swap buffer header to perform the IO */ - bp = getpbuf(); + + bp = getpbuf(&nsw_rcount); kva = (vm_offset_t) bp->b_data; /* * map our page(s) into kva for input + * + * NOTE: B_PAGING is set by pbgetvp() */ - pmap_qenter(kva, m, count); - bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING; - bp->b_iodone = swap_pager_iodone1; + pmap_qenter(kva, m + i, j - i); + + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = swp_pager_async_iodone; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; crhold(bp->b_rcred); crhold(bp->b_wcred); bp->b_data = (caddr_t) kva; - bp->b_blkno = reqaddr[0]; - bp->b_bcount = PAGE_SIZE * count; - bp->b_bufsize = PAGE_SIZE * count; + /* + * b_blkno is in page-sized chunks. swapblk is valid, too, so + * we don't have to mask it against SWAPBLK_MASK. + */ + bp->b_blkno = blk - (reqpage - i); + bp->b_bcount = PAGE_SIZE * (j - i); + bp->b_bufsize = PAGE_SIZE * (j - i); + bp->b_pager.pg_reqpage = reqpage - i; + + { + int k; + + for (k = i; k < j; ++k) { + bp->b_pages[k - i] = m[k]; + vm_page_flag_set(m[k], PG_SWAPINPROG); + } + } + bp->b_npages = j - i; pbgetvp(swapdev_vp, bp); cnt.v_swapin++; - cnt.v_swappgsin += count; + cnt.v_swappgsin += bp->b_npages; + + /* + * We still hold the lock on mreq, and our automatic completion routine + * does not remove it. + */ + + vm_object_pip_add(mreq->object, bp->b_npages); + lastpindex = m[j-1]->pindex; + /* - * perform the I/O + * perform the I/O. NOTE!!! bp cannot be considered valid after + * this point because we automatically release it on completion. + * Instead, we look at the one page we are interested in which we + * still hold a lock on even through the I/O completion. 
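+	 *
+	 * The handshake with the completion routine, roughly sketched
+	 * (both sides are in this file):
+	 *
+	 *	this routine			swp_pager_async_iodone()
+	 *	------------			------------------------
+	 *	set PG_SWAPINPROG		clear PG_SWAPINPROG
+	 *	set PG_WANTED, tsleep(mreq)	vm_page_flash(mreq) -> wakeup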
+ * + * The other pages in our m[] array are also released on completion, + * so we cannot assume they are valid anymore either. + * + * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY */ + VOP_STRATEGY(bp->b_vp, bp); /* - * wait for the sync I/O to complete + * wait for the page we want to complete. PG_SWAPINPROG is always + * cleared on completion. If an I/O error occurs, SWAPBLK_NONE + * is set in the meta-data. */ + s = splvm(); - while ((bp->b_flags & B_DONE) == 0) { - if (tsleep(bp, PVM, "swread", hz*20)) { + + while ((mreq->flags & PG_SWAPINPROG) != 0) { + vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED); + cnt.v_intrans++; + if (tsleep(mreq, PSWP, "swread", hz*20)) { printf( -"swap_pager: indefinite wait buffer: device: %#lx, blkno: %ld, size: %ld\n", + "swap_pager: indefinite wait buffer: device:" + " %#lx, blkno: %ld, size: %ld\n", (u_long)bp->b_dev, (long)bp->b_blkno, - (long)bp->b_bcount); + (long)bp->b_bcount + ); } } - if (bp->b_flags & B_ERROR) { - printf( -"swap_pager: I/O error - pagein failed; blkno %ld, size %ld, error %d\n", - (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error); - rv = VM_PAGER_ERROR; - } else { - rv = VM_PAGER_OK; - } - splx(s); - swb[reqpage]->swb_locked--; - - /* - * remove the mapping for kernel virtual - */ - pmap_qremove(kva, count); /* - * release the physical I/O buffer - */ - relpbuf(bp); - /* - * finish up input if everything is ok + * mreq is left bussied after completion, but all the other pages + * are freed. If we had an unrecoverable read error the page will + * not be valid. */ - if (rv == VM_PAGER_OK) { - for (i = 0; i < count; i++) { - m[i]->dirty = 0; - vm_page_flag_clear(m[i], PG_ZERO); - if (i != reqpage) { - /* - * whether or not to leave the page - * activated is up in the air, but we - * should put the page on a page queue - * somewhere. (it already is in the - * object). After some emperical - * results, it is best to deactivate - * the readahead pages. - */ - vm_page_deactivate(m[i]); - /* - * just in case someone was asking for - * this page we now tell them that it - * is ok to use - */ - m[i]->valid = VM_PAGE_BITS_ALL; - vm_page_wakeup(m[i]); - } - } - - m[reqpage]->object->last_read = m[count-1]->pindex; + if (mreq->valid != VM_PAGE_BITS_ALL) { + return(VM_PAGER_ERROR); } else { - swap_pager_ridpages(m, count, reqpage); + mreq->object->last_read = lastpindex; + return(VM_PAGER_OK); } - return (rv); + + /* + * A final note: in a low swap situation, we cannot deallocate swap + * and mark a page dirty here because the caller is likely to mark + * the page clean when we return, causing the page to possibly revert + * to all-zero's later. + */ } +/* + * swap_pager_putpages: + * + * Assign swap (if necessary) and initiate I/O on the specified pages. + * + * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects + * are automatically converted to SWAP objects. + * + * In a low memory situation we may block in VOP_STRATEGY(), but the new + * vm_page reservation system coupled with properly written VFS devices + * should ensure that no low-memory deadlock occurs. This is an area + * which needs work. + * + * The parent has N vm_object_pip_add() references prior to + * calling us and will remove references for rtvals[] that are + * not set to VM_PAGER_PEND. We need to remove the rest on I/O + * completion. + * + * The parent has soft-busy'd the pages it passes us and will unbusy + * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. + * We need to unbusy the rest on I/O completion. 
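+ *
+ *	A sketch of the caller-side convention this implies
+ *	(illustrative only, not a quote of the real pageout code):
+ *
+ *		vm_object_pip_add(object, count);
+ *		swap_pager_putpages(object, m, count, FALSE, rtvals);
+ *		for (i = 0; i < count; ++i) {
+ *			if (rtvals[i] != VM_PAGER_PEND) {
+ *				vm_object_pip_wakeup(object);
+ *				vm_page_io_finish(m[i]);
+ *			}
+ *		}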
+ */ + int swap_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; @@ -1136,534 +982,749 @@ swap_pager_putpages(object, m, count, sync, rtvals) boolean_t sync; int *rtvals; { - register struct buf *bp; - sw_blk_t swb[count]; - register int s; - int i, j, ix, firstidx, lastidx; - boolean_t rv; - vm_offset_t kva, off, fidx; - swp_clean_t spc; - vm_pindex_t paging_pindex; - int reqaddr[count]; - int failed; - - if (vm_swap_size) - no_swap_space = 0; - - if (no_swap_space) { - for (i = 0; i < count; i++) - rtvals[i] = VM_PAGER_FAIL; - return VM_PAGER_FAIL; + int i; + int n = 0; + int grv = VM_PAGER_OK; + +#if !defined(MAX_PERF) + if (count && m[0]->object != object) { + panic("swap_pager_getpages: object mismatch %p/%p", + object, + m[0]->object + ); + } +#endif + /* + * Step 1 + * + * Turn object into OBJT_SWAP + * check for bogus sysops + * force sync if not pageout process + */ + + if (object->type != OBJT_SWAP) { + swp_pager_meta_build(object, 0, SWAPBLK_NONE, 0); } if (curproc != pageproc) sync = TRUE; - object = m[0]->object; - paging_pindex = OFF_TO_IDX(object->paging_offset); - - failed = 0; - for (j = 0; j < count; j++) { - fidx = m[j]->pindex + paging_pindex; - ix = swap_pager_block_index(fidx); - swb[j] = 0; - if (ix >= object->un_pager.swp.swp_nblocks) { - rtvals[j] = VM_PAGER_FAIL; - failed = 1; - continue; - } else { - rtvals[j] = VM_PAGER_OK; - } - swb[j] = &object->un_pager.swp.swp_blocks[ix]; - swb[j]->swb_locked++; - if (failed) { - rtvals[j] = VM_PAGER_FAIL; - continue; - } - off = swap_pager_block_offset(fidx); - reqaddr[j] = swb[j]->swb_block[off]; - if (reqaddr[j] == SWB_EMPTY) { - daddr_t blk; - int tries; - int ntoget; + /* + * Step 2 + * + * Assign swap blocks and issue I/O. We reallocate swap on the fly. + * The page is left dirty until the pageout operation completes + * successfully. + */ - tries = 0; - s = splvm(); + for (i = 0; i < count; i += n) { + int s; + int j; + struct buf *bp; + daddr_t blk; - /* - * if any other pages have been allocated in this - * block, we only try to get one page. - */ - for (i = 0; i < SWB_NPAGES; i++) { - if (swb[j]->swb_block[i] != SWB_EMPTY) - break; - } + /* + * Maximum I/O size is limited by a number of factors. + */ - ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1; - /* - * this code is alittle conservative, but works (the - * intent of this code is to allocate small chunks for - * small objects) - */ - if ((off == 0) && ((fidx + ntoget) > object->size)) { - ntoget = object->size - fidx; - } - retrygetspace: - if (!swap_pager_full && ntoget > 1 && - swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE), - &blk)) { - - for (i = 0; i < ntoget; i++) { - swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; - swb[j]->swb_valid = 0; - } + n = min(BLIST_MAX_ALLOC, count - i); + n = min(n, max_pageout_cluster); - reqaddr[j] = swb[j]->swb_block[off]; - } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE), - &swb[j]->swb_block[off])) { - /* - * if the allocation has failed, we try to - * reclaim space and retry. - */ - if (++tries == 1) { - swap_pager_reclaim(); - goto retrygetspace; - } - rtvals[j] = VM_PAGER_AGAIN; - failed = 1; - swap_pager_full = 1; - } else { - reqaddr[j] = swb[j]->swb_block[off]; - swb[j]->swb_valid &= ~(1 << off); + /* + * Get biggest block of swap we can. If we fail, fall + * back and try to allocate a smaller block. Don't go + * overboard trying to allocate space if it would overly + * fragment swap. 
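+		 *
+		 * E.g. a request for n = 16 tries 16, then 8, then 4
+		 * pages and gives up below that; shattering the request
+		 * into single-page allocations would trade I/O
+		 * clustering for swap fragmentation.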
+ */ + while ( + (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && + n > 4 + ) { + n >>= 1; + } + if (blk == SWAPBLK_NONE) { + for (j = 0; j < n; ++j) { + rtvals[i+j] = VM_PAGER_FAIL; } - splx(s); + grv = VM_PAGER_FAIL; + continue; } - } - /* - * search forwards for the last contiguous page to transfer - */ - failed = 0; - for (i = 0; i < count; i++) { - if (failed || - (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) || - ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) || - (rtvals[i] != VM_PAGER_OK)) { - failed = 1; - if (rtvals[i] == VM_PAGER_OK) - rtvals[i] = VM_PAGER_AGAIN; + /* + * Oops, too big if it crosses a stripe + * + * 1111000000 + * 111111 + * 1000001 + */ + if ((blk ^ (blk + n)) & dmmax_mask) { + j = ((blk + dmmax) & dmmax_mask) - blk; + swp_pager_freeswapspace(blk + j, n - j); + n = j; } - } - ix = 0; - firstidx = -1; - for (i = 0; i < count; i++) { - if (rtvals[i] == VM_PAGER_OK) { - ix++; - if (firstidx == -1) { - firstidx = i; - } - } else if (firstidx >= 0) { - break; - } - } + /* + * All I/O parameters have been satisfied, build the I/O + * request and assign the swap space. + * + * NOTE: B_PAGING is set by pbgetvp() + */ - if (firstidx == -1) { - for (i = 0; i < count; i++) { - if (rtvals[i] == VM_PAGER_OK) - rtvals[i] = VM_PAGER_AGAIN; - } - return VM_PAGER_AGAIN; - } + bp = getpbuf(&nsw_wcount); + bp->b_spc = NULL; /* not used, but NULL-out anyway */ - lastidx = firstidx + ix; + pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); - if (ix > max_pageout_cluster) { - for (i = firstidx + max_pageout_cluster; i < lastidx; i++) { - if (rtvals[i] == VM_PAGER_OK) - rtvals[i] = VM_PAGER_AGAIN; - } - ix = max_pageout_cluster; - lastidx = firstidx + ix; - } + bp->b_flags = B_BUSY | B_ASYNC; + bp->b_proc = &proc0; /* XXX (but without B_PHYS this is ok) */ + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - for (i = 0; i < firstidx; i++) { - if (swb[i]) - swb[i]->swb_locked--; - } + if (bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if (bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + pbgetvp(swapdev_vp, bp); - for (i = lastidx; i < count; i++) { - if (swb[i]) - swb[i]->swb_locked--; - } + bp->b_bcount = PAGE_SIZE * n; + bp->b_bufsize = PAGE_SIZE * n; + bp->b_blkno = blk; -#ifdef INVARIANTS - for (i = firstidx; i < lastidx; i++) { - if (reqaddr[i] == SWB_EMPTY) { - printf("I/O to empty block???? -- pindex: %d, i: %d\n", - m[i]->pindex, i); - } - } -#endif + s = splvm(); - /* - * Clean up all completed async pageouts. - */ - if (swap_pager_free_pending) - swap_pager_sync(); + for (j = 0; j < n; ++j) { + vm_page_t mreq = m[i+j]; - /* - * get a swap pager clean data structure, block until we get it - */ - if (curproc == pageproc) { - if (swap_pager_free_count == 0) { - s = splvm(); - while (swap_pager_free_count == 0) { - swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT; - /* - * if it does not get one within a short time, then - * there is a potential deadlock, so we go-on trying - * to free pages. It is important to block here as opposed - * to returning, thereby allowing the pageout daemon to continue. - * It is likely that pageout daemon will start suboptimally - * reclaiming vnode backed pages if we don't block. Since the - * I/O subsystem is probably already fully utilized, might as - * well wait. 
- */ - if (tsleep(&swap_pager_needflags, PVM-1, "swpfre", hz/2)) { - if (swap_pager_free_pending) - swap_pager_sync(); - if (swap_pager_free_count == 0) { - for (i = firstidx; i < lastidx; i++) { - rtvals[i] = VM_PAGER_AGAIN; - } - splx(s); - return VM_PAGER_AGAIN; - } - } else { - swap_pager_sync(); - } - } - splx(s); + swp_pager_meta_build( + mreq->object, + mreq->pindex, + blk + j, + 0 + ); + mreq->dirty = VM_PAGE_BITS_ALL; + rtvals[i+j] = VM_PAGER_OK; + + vm_page_flag_set(mreq, PG_SWAPINPROG); + bp->b_pages[j] = mreq; } + bp->b_flags |= B_CALL; + bp->b_npages = n; - spc = TAILQ_FIRST(&swap_pager_free); - KASSERT(spc != NULL, - ("swap_pager_putpages: free queue is empty, %d expected\n", - swap_pager_free_count)); - TAILQ_REMOVE(&swap_pager_free, spc, spc_list); - swap_pager_free_count--; - - kva = spc->spc_kva; - bp = spc->spc_bp; - bzero(bp, sizeof *bp); - bp->b_spc = spc; - bp->b_xflags = 0; - bp->b_data = (caddr_t) kva; - } else { - spc = NULL; - bp = getpbuf(); - kva = (vm_offset_t) bp->b_data; - bp->b_spc = NULL; - } + cnt.v_swapout++; + cnt.v_swappgsout += bp->b_npages; + swapdev_vp->v_numoutput++; - /* - * map our page(s) into kva for I/O - */ - pmap_qenter(kva, &m[firstidx], ix); + /* + * asynchronous + * + * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY + */ + + if (sync == FALSE) { + bp->b_iodone = swp_pager_async_iodone; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bcount; + VOP_STRATEGY(bp->b_vp, bp); + + for (j = 0; j < n; ++j) + rtvals[i+j] = VM_PAGER_PEND; + + splx(s); + grv = VM_PAGER_PEND; + continue; + } - /* - * get the base I/O offset into the swap file - */ - for (i = firstidx; i < lastidx ; i++) { - fidx = m[i]->pindex + paging_pindex; - off = swap_pager_block_offset(fidx); /* - * set the valid bit + * synchronous + * + * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY */ - swb[i]->swb_valid |= (1 << off); + + bp->b_iodone = swp_pager_sync_iodone; + VOP_STRATEGY(bp->b_vp, bp); + /* - * and unlock the data structure + * Wait for the sync I/O to complete, then update rtvals. + * We just set the rtvals[] to VM_PAGER_PEND so we can call + * our async completion routine at the end, thus avoiding a + * double-free. */ - swb[i]->swb_locked--; - } + while ((bp->b_flags & B_DONE) == 0) { + tsleep(bp, PVM, "swwrt", 0); + } - bp->b_flags = B_BUSY | B_PAGING; - bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ - bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - if (bp->b_rcred != NOCRED) - crhold(bp->b_rcred); - if (bp->b_wcred != NOCRED) - crhold(bp->b_wcred); - bp->b_blkno = reqaddr[firstidx]; - pbgetvp(swapdev_vp, bp); + if (bp->b_flags & B_ERROR) { + grv = VM_PAGER_ERROR; + } - bp->b_bcount = PAGE_SIZE * ix; - bp->b_bufsize = PAGE_SIZE * ix; + for (j = 0; j < n; ++j) + rtvals[i+j] = VM_PAGER_PEND; - s = splvm(); - swapdev_vp->v_numoutput++; + if (bp->b_flags & B_ERROR) { + grv = VM_PAGER_ERROR; + } - /* - * If this is an async write we set up additional buffer fields and - * place a "cleaning" entry on the inuse queue. 
- */ - object->un_pager.swp.swp_poip++; - - if (spc) { - spc->spc_flags = 0; - spc->spc_object = object; - bp->b_npages = ix; - for (i = firstidx; i < lastidx; i++) { - spc->spc_m[i] = m[i]; - bp->b_pages[i - firstidx] = m[i]; - vm_page_protect(m[i], VM_PROT_READ); - pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); - m[i]->dirty = 0; - } - spc->spc_first = firstidx; - spc->spc_count = ix; /* - * the completion routine for async writes + * Now that we are through with the bp, we can call the + * normal async completion, which frees everything up. */ - bp->b_flags |= B_CALL; - bp->b_iodone = swap_pager_iodone; - bp->b_dirtyoff = 0; - bp->b_dirtyend = bp->b_bcount; - TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); - } else { - bp->b_flags |= B_CALL; - bp->b_iodone = swap_pager_iodone1; - bp->b_npages = ix; - for (i = firstidx; i < lastidx; i++) - bp->b_pages[i - firstidx] = m[i]; - } - cnt.v_swapout++; - cnt.v_swappgsout += ix; + swp_pager_async_iodone(bp); - /* - * perform the I/O - */ - VOP_STRATEGY(bp->b_vp, bp); - if (sync == FALSE) { - if (swap_pager_free_pending) { - swap_pager_sync(); - } - for (i = firstidx; i < lastidx; i++) { - rtvals[i] = VM_PAGER_PEND; - } splx(s); - return VM_PAGER_PEND; } + return(grv); +} + +/* + * swap_pager_sync_iodone: + * + * Completion routine for synchronous reads and writes from/to swap. + * We just mark the bp is complete and wake up anyone waiting on it. + * + * This routine may not block. + */ + +static void +swp_pager_sync_iodone(bp) + struct buf *bp; +{ + bp->b_flags |= B_DONE; + bp->b_flags &= ~B_ASYNC; + wakeup(bp); +} + +/* + * swp_pager_async_iodone: + * + * Completion routine for asynchronous reads and writes from/to swap. + * Also called manually by synchronous code to finish up a bp. + * + * WARNING! This routine may be called from an interrupt. We cannot + * mess with swap metadata unless we want to run all our other routines + * at splbio() too, which I'd rather not do. We up ourselves + * to splvm() because we may call vm_page_free(), which can unlink a + * page from an object. + * + * XXX currently I do not believe any object routines protect + * object->memq at splvm(). The code must be gone over to determine + * the actual state of the problem. + * + * For READ operations, the pages are PG_BUSY'd. For WRITE operations, + * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY + * unbusy all pages except the 'main' request page. For WRITE + * operations, we vm_page_t->busy'd unbusy all pages ( we can do this + * because we marked them all VM_PAGER_PEND on return from putpages ). + * + * This routine may not block. + * This routine is called at splbio() + */ + +static void +swp_pager_async_iodone(bp) + register struct buf *bp; +{ + int s; + int i; + vm_object_t object = NULL; + + s = splvm(); + + bp->b_flags |= B_DONE; + /* - * wait for the sync I/O to complete + * report error */ - while ((bp->b_flags & B_DONE) == 0) { - tsleep(bp, PVM, "swwrt", 0); - } if (bp->b_flags & B_ERROR) { printf( -"swap_pager: I/O error - pageout failed; blkno %ld, size %ld, error %d\n", - (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error); - rv = VM_PAGER_ERROR; - } else { - rv = VM_PAGER_OK; + "swap_pager: I/O error - %s failed; blkno %ld," + "size %ld, error %d\n", + ((bp->b_flags & B_READ) ? "pagein" : "pageout"), + (long)bp->b_blkno, + (long)bp->b_bcount, + bp->b_error + ); } - object->un_pager.swp.swp_poip--; - if (object->un_pager.swp.swp_poip == 0) - wakeup(object); - - if (bp->b_vp) - pbrelvp(bp); + /* + * set object. 
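+	 *
+	 * All pages in a single pbuf belong to one object (both the
+	 * getpages and putpages paths build each buf that way), so
+	 * b_pages[0] suffices to recover it for the pip accounting
+	 * below.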
+ */ - splx(s); + if (bp->b_npages) + object = bp->b_pages[0]->object; /* * remove the mapping for kernel virtual */ - pmap_qremove(kva, ix); + + pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); /* - * if we have written the page, then indicate that the page is clean. + * cleanup pages. If an error occurs writing to swap, we are in + * very serious trouble. If it happens to be a disk error, though, + * we may be able to recover by reassigning the swap later on. So + * in this case we remove the m->swapblk assignment for the page + * but do not free it in the rlist. The errornous block(s) are thus + * never reallocated as swap. Redirty the page and continue. */ - if (rv == VM_PAGER_OK) { - for (i = firstidx; i < lastidx; i++) { - if (rtvals[i] == VM_PAGER_OK) { - pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); - m[i]->dirty = 0; + + for (i = 0; i < bp->b_npages; ++i) { + vm_page_t m = bp->b_pages[i]; + + vm_page_flag_clear(m, PG_SWAPINPROG); + + if (bp->b_flags & B_ERROR) { + /* + * If an error occurs I'd love to throw the swapblk + * away without freeing it back to swapspace, so it + * can never be used again. But I can't from an + * interrupt. + */ + + if (bp->b_flags & B_READ) { /* - * optimization, if a page has been read - * during the pageout process, we activate it. + * When reading, reqpage needs to stay + * locked for the parent, but all other + * pages can be freed. We still want to + * wakeup the parent waiting on the page, + * though. ( also: pg_reqpage can be -1 and + * not match anything ). + * + * We have to wake specifically requested pages + * up too because we cleared PG_SWAPINPROG and + * someone may be waiting for that. + * + * NOTE: for reads, m->dirty will probably + * be overriden by the original caller of + * getpages so don't play cute tricks here. + * + * XXX it may not be legal to free the page + * here as this messes with the object->memq's. */ - if (((m[i]->flags & (PG_WANTED|PG_REFERENCED)) || - pmap_ts_referenced(VM_PAGE_TO_PHYS(m[i])))) { - vm_page_activate(m[i]); - } + + m->valid = 0; + vm_page_flag_clear(m, PG_ZERO); + + if (i != bp->b_pager.pg_reqpage) + vm_page_free(m); + else + vm_page_flash(m); + /* + * If i == bp->b_pager.pg_reqpage, do not wake + * the page up. The caller needs to. + */ + } else { + /* + * If a write error occurs, reactivate page + * so it doesn't clog the inactive list, + * then finish the I/O. + */ + m->dirty = VM_PAGE_BITS_ALL; + vm_page_activate(m); + vm_page_io_finish(m); } - } - } else { - for (i = firstidx; i < lastidx; i++) { - rtvals[i] = rv; + } else if (bp->b_flags & B_READ) { + /* + * For read success, clear dirty bits. Nobody should + * have this page mapped but don't take any chances, + * make sure the pmap modify bits are also cleared. + * + * NOTE: for reads, m->dirty will probably be + * overriden by the original caller of getpages so + * we cannot set them in order to free the underlying + * swap in a low-swap situation. I don't think we'd + * want to do that anyway, but it was an optimization + * that existed in the old swapper for a time before + * it got ripped out due to precisely this problem. + * + * clear PG_ZERO in page. + * + * If not the requested page then deactivate it. + * + * Note that the requested page, reqpage, is left + * busied, but we still have to wake it up. The + * other pages are released (unbusied) by + * vm_page_wakeup(). We do not set reqpage's + * valid bits here, it is up to the caller. 
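+			 *
+			 * The distinction relied upon here, roughly
+			 * (the real routines live in vm_page.h):
+			 *
+			 *  vm_page_wakeup(m) - clear PG_BUSY, then wake
+			 *                      any sleepers
+			 *  vm_page_flash(m)  - wake any sleepers, leave
+			 *                      PG_BUSY alone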
+ */ + + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->valid = VM_PAGE_BITS_ALL; + m->dirty = 0; + vm_page_flag_clear(m, PG_ZERO); + + /* + * We have to wake specifically requested pages + * up too because we cleared PG_SWAPINPROG and + * could be waiting for it in getpages. However, + * be sure to not unbusy getpages specifically + * requested page - getpages expects it to be + * left busy. + */ + if (i != bp->b_pager.pg_reqpage) { + vm_page_deactivate(m); + vm_page_wakeup(m); + } else { + vm_page_flash(m); + } + } else { + /* + * For write success, clear the modify and dirty + * status, then finish the I/O ( which decrements the + * busy count and possibly wakes waiter's up ). + */ + vm_page_protect(m, VM_PROT_READ); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->dirty = 0; + vm_page_io_finish(m); } } - if (spc != NULL) { - if (bp->b_rcred != NOCRED) - crfree(bp->b_rcred); - if (bp->b_wcred != NOCRED) - crfree(bp->b_wcred); - spc_free(spc); - } else - relpbuf(bp); - if (swap_pager_free_pending) - swap_pager_sync(); - - return (rv); + /* + * adjust pip. NOTE: the original parent may still have its own + * pip refs on the object. + */ + + if (object) + vm_object_pip_wakeupn(object, bp->b_npages); + + /* + * release the physical I/O buffer + */ + + relpbuf(bp, ((bp->b_flags & B_READ) ? &nsw_rcount : &nsw_wcount)); + + splx(s); } -void -swap_pager_sync() +/************************************************************************ + * SWAP META DATA * + ************************************************************************ + * + * These routines manipulate the swap metadata stored in the + * OBJT_SWAP object. + * + * In fact, we just have a few counters in the vm_object_t. The + * metadata is actually stored in a hash table. + */ + +/* + * SWP_PAGER_HASH() - hash swap meta data + * + * This is an inline helper function which hash the swapblk given + * the object and page index. It returns a pointer to a pointer + * to the object, or a pointer to a NULL pointer if it could not + * find a swapblk. + */ + +static __inline struct swblock ** +swp_pager_hash(vm_object_t object, daddr_t index) { - swp_clean_t spc; + struct swblock **pswap; + struct swblock *swap; + + index &= ~SWAP_META_MASK; + pswap = &swhash[(index ^ (int)(long)object) & swhash_mask]; - while (spc = TAILQ_FIRST(&swap_pager_done)) { - swap_pager_finish(spc); + while ((swap = *pswap) != NULL) { + if (swap->swb_object == object && + swap->swb_index == index + ) { + break; + } + pswap = &swap->swb_hnext; } - return; + return(pswap); } +/* + * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object + * + * We first convert the object to a swap object if it is a default + * object. + * + * The specified swapblk is added to the object's swap metadata. If + * the swapblk is not valid, it is freed instead. Any previously + * assigned swapblk is freed. 
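+ *
+ *	Typical call, as issued from swap_pager_putpages() once swap
+ *	has been reserved for page j of a cluster:
+ *
+ *		swp_pager_meta_build(mreq->object, mreq->pindex,
+ *		    blk + j, 0);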
+ */ + static void -swap_pager_finish(spc) - register swp_clean_t spc; -{ - int i, s, lastidx; - vm_object_t object; - vm_page_t *ma; +swp_pager_meta_build( + vm_object_t object, + daddr_t index, + daddr_t swapblk, + int waitok +) { + struct swblock *swap; + struct swblock **pswap; - ma = spc->spc_m; - object = spc->spc_object; - lastidx = spc->spc_first + spc->spc_count; + /* + * Convert default object to swap object if necessary + */ - s = splvm(); - TAILQ_REMOVE(&swap_pager_done, spc, spc_list); - splx(s); + if (object->type != OBJT_SWAP) { + object->type = OBJT_SWAP; + object->un_pager.swp.swp_bcount = 0; + + if (object->handle != NULL) { + TAILQ_INSERT_TAIL( + NOBJLIST(object->handle), + object, + pager_object_list + ); + } else { + TAILQ_INSERT_TAIL( + &swap_pager_un_object_list, + object, + pager_object_list + ); + } + } + + /* + * Wait for free memory when waitok is TRUE prior to calling the + * zone allocator. + */ - pmap_qremove(spc->spc_kva, spc->spc_count); + while (waitok && cnt.v_free_count == 0) { + VM_WAIT; + } /* - * If no error, mark as clean and inform the pmap system. If error, - * mark as dirty so we will try again. (XXX could get stuck doing - * this, should give up after awhile) + * If swapblk being added is invalid, just free it. */ - if (spc->spc_flags & SPC_ERROR) { - for (i = spc->spc_first; i < lastidx; i++) { - printf("swap_pager_finish: I/O error, clean of page %lx failed\n", - (u_long) VM_PAGE_TO_PHYS(ma[i])); - ma[i]->dirty = VM_PAGE_BITS_ALL; - vm_page_io_finish(ma[i]); + if (swapblk & SWAPBLK_NONE) { + if (swapblk != SWAPBLK_NONE) { + swp_pager_freeswapspace( + index, + 1 + ); + swapblk = SWAPBLK_NONE; } + } - vm_object_pip_subtract(object, spc->spc_count); - if ((object->paging_in_progress == 0) && - (object->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(object, OBJ_PIPWNT); - wakeup(object); - } + /* + * Locate hash entry. If not found create, but if we aren't adding + * anything just return. + */ - } else { - for (i = spc->spc_first; i < lastidx; i++) { - if ((ma[i]->queue != PQ_ACTIVE) && - ((ma[i]->flags & PG_WANTED) || - pmap_ts_referenced(VM_PAGE_TO_PHYS(ma[i])))) { - vm_page_activate(ma[i]); - } - } + pswap = swp_pager_hash(object, index); + + if ((swap = *pswap) == NULL) { + int i; + + if (swapblk == SWAPBLK_NONE) + return; + + swap = *pswap = zalloc(swap_zone); + + swap->swb_hnext = NULL; + swap->swb_object = object; + swap->swb_index = index & ~SWAP_META_MASK; + swap->swb_count = 0; + + ++object->un_pager.swp.swp_bcount; + + for (i = 0; i < SWAP_META_PAGES; ++i) + swap->swb_pages[i] = SWAPBLK_NONE; } - nswiodone -= spc->spc_count; - swap_pager_free_pending--; - spc_free(spc); + /* + * Delete prior contents of metadata + */ - return; + index &= SWAP_META_MASK; + + if (swap->swb_pages[index] != SWAPBLK_NONE) { + swp_pager_freeswapspace( + swap->swb_pages[index] & SWAPBLK_MASK, + 1 + ); + --swap->swb_count; + } + + /* + * Enter block into metadata + */ + + swap->swb_pages[index] = swapblk; + ++swap->swb_count; } /* - * swap_pager_iodone + * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata + * + * The requested range of blocks is freed, with any associated swap + * returned to the swap bitmap. + * + * This routine will free swap metadata structures as they are cleaned + * out. This routine does *NOT* operate on swap metadata associated + * with resident pages. 
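+ *
+ *	When no swblock exists for the run containing the current
+ *	index, the loop below steps over the remainder of that run in
+ *	one shot (up to SWAP_META_PAGES entries) rather than probing
+ *	page by page.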
+ * + * This routine must be called at splvm() */ + static void -swap_pager_iodone(bp) - register struct buf *bp; +swp_pager_meta_free(vm_object_t object, daddr_t index, daddr_t count) { - int i, s, lastidx; - register swp_clean_t spc; - vm_object_t object; - vm_page_t *ma; + if (object->type != OBJT_SWAP) + return; + while (count > 0) { + struct swblock **pswap; + struct swblock *swap; - s = splvm(); - spc = (swp_clean_t) bp->b_spc; - TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); - TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); + pswap = swp_pager_hash(object, index); - object = spc->spc_object; + if ((swap = *pswap) != NULL) { + daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; -#if defined(DIAGNOSTIC) - if (object->paging_in_progress < spc->spc_count) - printf("swap_pager_iodone: paging_in_progress(%d) < spc_count(%d)\n", - object->paging_in_progress, spc->spc_count); -#endif - - if (bp->b_flags & B_ERROR) { - spc->spc_flags |= SPC_ERROR; - printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n", - (bp->b_flags & B_READ) ? "pagein" : "pageout", - (u_long) bp->b_blkno, bp->b_bcount, bp->b_error); - } else { - vm_object_pip_subtract(object, spc->spc_count); - if ((object->paging_in_progress == 0) && - (object->flags & OBJ_PIPWNT)) { - vm_object_clear_flag(object, OBJ_PIPWNT); - wakeup(object); - } - ma = spc->spc_m; - lastidx = spc->spc_first + spc->spc_count; - for (i = spc->spc_first; i < lastidx; i++) { - /* - * we wakeup any processes that are waiting on these pages. - */ - vm_page_io_finish(ma[i]); + if (v != SWAPBLK_NONE) { + swp_pager_freeswapspace(v, 1); + swap->swb_pages[index & SWAP_META_MASK] = + SWAPBLK_NONE; + if (--swap->swb_count == 0) { + *pswap = swap->swb_hnext; + zfree(swap_zone, swap); + --object->un_pager.swp.swp_bcount; + } + } + --count; + ++index; + } else { + daddr_t n = SWAP_META_PAGES - (index & SWAP_META_MASK); + count -= n; + index += n; } } +} + +/* + * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object + * + * This routine locates and destroys all swap metadata associated with + * an object. 
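+ *
+ *	Termination is driven by swp_bcount: destroying a swblock
+ *	decrements it, and the sweep advances the probe index by
+ *	SWAP_META_PAGES per step until the count reaches zero.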
+ */ + +static void +swp_pager_meta_free_all(vm_object_t object) +{ + daddr_t index = 0; - if (bp->b_vp) - pbrelvp(bp); + if (object->type != OBJT_SWAP) + return; - if (bp->b_rcred != NOCRED) - crfree(bp->b_rcred); - if (bp->b_wcred != NOCRED) - crfree(bp->b_wcred); + while (object->un_pager.swp.swp_bcount) { + struct swblock **pswap; + struct swblock *swap; - nswiodone += spc->spc_count; - swap_pager_free_pending++; - if (--spc->spc_object->un_pager.swp.swp_poip == 0) { - wakeup(spc->spc_object); - } + pswap = swp_pager_hash(object, index); + if ((swap = *pswap) != NULL) { + int i; - if (swap_pager_needflags && - ((swap_pager_free_count + swap_pager_free_pending) > (npendingio / 2))) { - spc_wakeup(); + for (i = 0; i < SWAP_META_PAGES; ++i) { + daddr_t v = swap->swb_pages[i]; + if (v != SWAPBLK_NONE) { +#if !defined(MAX_PERF) + --swap->swb_count; +#endif + swp_pager_freeswapspace( + v, + 1 + ); + } + } +#if !defined(MAX_PERF) + if (swap->swb_count != 0) + panic("swap_pager_meta_free_all: swb_count != 0"); +#endif + *pswap = swap->swb_hnext; + zfree(swap_zone, swap); + --object->un_pager.swp.swp_bcount; + } + index += SWAP_META_PAGES; +#if !defined(MAX_PERF) + if (index > 0x20000000) + panic("swp_pager_meta_free_all: failed to locate all swap meta blocks"); +#endif } +} - if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) && - vm_pageout_pages_needed) { - wakeup(&vm_pageout_pages_needed); - vm_pageout_pages_needed = 0; +/* + * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. + * + * This routine is capable of looking up, popping, or freeing + * swapblk assignments in the swap meta data or in the vm_page_t. + * The routine typically returns the swapblk being looked-up, or popped, + * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block + * was invalid. This routine will automatically free any invalid + * meta-data swapblks. + * + * It is not possible to store invalid swapblks in the swap meta data + * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. + * + * When acting on a busy resident page and paging is in progress, we + * have to wait until paging is complete but otherwise can act on the + * busy page. + * + * SWM_FREE remove and free swap block from metadata + * + * SWM_POP remove from meta data but do not free.. pop it out + */ + +static daddr_t +swp_pager_meta_ctl( + vm_object_t object, + vm_pindex_t index, + int flags +) { + /* + * The meta data only exists of the object is OBJT_SWAP + * and even then might not be allocated yet. + */ + + if ( + object->type != OBJT_SWAP || + object->un_pager.swp.swp_bcount == 0 + ) { + return(SWAPBLK_NONE); } - splx(s); + { + struct swblock **pswap; + struct swblock *swap; + daddr_t r1 = SWAPBLK_NONE; + + pswap = swp_pager_hash(object, index); + + index &= SWAP_META_MASK; + + if ((swap = *pswap) != NULL) { + r1 = swap->swb_pages[index]; + + if (r1 != SWAPBLK_NONE) { + if (flags & SWM_FREE) { + swp_pager_freeswapspace( + r1, + 1 + ); + r1 = SWAPBLK_NONE; + } + if (flags & (SWM_FREE|SWM_POP)) { + swap->swb_pages[index] = SWAPBLK_NONE; + if (--swap->swb_count == 0) { + *pswap = swap->swb_hnext; + zfree(swap_zone, swap); + --object->un_pager.swp.swp_bcount; + } + } + } + } + + return(r1); + } + /* not reached */ } + diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index ceb88b6..374223c 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. 
* * from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90 - * $Id: swap_pager.h,v 1.21 1998/04/29 04:28:02 dyson Exp $ + * $Id: swap_pager.h,v 1.22 1998/07/10 21:50:17 alex Exp $ */ /* @@ -59,26 +59,50 @@ #define SWB_NPAGES 8 #endif +/* + * Piecemeal swap metadata structure. Swap is stored in a radix tree. + * + * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix + * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents + * 32K worth of data, two levels represent 256K, three levels represent + * 2 MBytes. This is acceptable. + * + * Overall memory utilization is about the same as the old swap structure. + */ + +#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) + +#define SWAP_META_PAGES (SWB_NPAGES * 2) +#define SWAP_META_MASK (SWAP_META_PAGES - 1) + struct swblock { - unsigned short swb_valid; /* bitmask for valid pages */ - unsigned short swb_locked; /* block locked */ - daddr_t swb_block[SWB_NPAGES]; + struct swblock *swb_hnext; + vm_object_t swb_object; + int swb_index; + int swb_count; + daddr_t swb_pages[SWAP_META_PAGES]; }; -typedef struct swblock *sw_blk_t; #ifdef KERNEL extern struct pagerlst swap_pager_un_object_list; extern int swap_pager_full; -extern struct rlisthdr swaplist; +extern struct blist *swapblist; + +int swap_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); +boolean_t swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, int *before, int *after)); -int swap_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); int swap_pager_swp_alloc __P((vm_object_t, int)); -void swap_pager_copy __P((vm_object_t, vm_pindex_t, vm_object_t, - vm_pindex_t, vm_pindex_t, int)); +void swap_pager_copy __P((vm_object_t, vm_object_t, vm_pindex_t, int)); void swap_pager_freespace __P((vm_object_t, vm_pindex_t, vm_size_t)); void swap_pager_dmzspace __P((vm_object_t, vm_pindex_t, vm_size_t)); void swap_pager_swap_init __P((void)); -void swap_pager_sync __P((void)); + +/* + * newswap functions + */ + +void swap_pager_page_removed __P((vm_page_t, vm_object_t)); + #endif #endif /* _SWAP_PAGER_ */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index e3d64f9..d0f4754 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -66,7 +66,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_fault.c,v 1.92 1999/01/08 17:31:24 eivind Exp $ + * $Id: vm_fault.c,v 1.93 1999/01/10 01:58:28 eivind Exp $ */ /* @@ -114,7 +114,7 @@ struct faultstate { struct vnode *vp; }; -static void +static __inline void release_page(struct faultstate *fs) { vm_page_wakeup(fs->m); @@ -122,7 +122,7 @@ release_page(struct faultstate *fs) fs->m = NULL; } -static void +static __inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { @@ -263,36 +263,43 @@ RetryFault:; fs.object = fs.first_object; fs.pindex = fs.first_pindex; - /* - * See whether this page is resident - */ while (TRUE) { + /* + * If the object is dead, we stop here + */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } + + /* + * See if page is resident + */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { int queue, s; /* - * If the page is being brought in, wait for it and - * then retry. + * Wait/Retry if the page is busy. 
We have to do this + * if the page is busy via either PG_BUSY or + * vm_page_t->busy because the vm_pager may be using + * vm_page_t->busy for pageouts ( and even pageins if + * it is the vnode pager ), and we could end up trying + * to pagein and pageout the same page simultaniously. + * + * We can theoretically allow the busy case on a read + * fault if the page is marked valid, but since such + * pages are typically already pmap'd, putting that + * special case in might be more effort then it is + * worth. We cannot under any circumstances mess + * around with a vm_page_t->busy page except, perhaps, + * to pmap it. */ - if ((fs.m->flags & PG_BUSY) || - (fs.m->busy && - (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { + if ((fs.m->flags & PG_BUSY) || fs.m->busy) { unlock_things(&fs); - s = splvm(); - if ((fs.m->flags & PG_BUSY) || - (fs.m->busy && - (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { - vm_page_flag_set(fs.m, PG_WANTED | PG_REFERENCED); - cnt.v_intrans++; - tsleep(fs.m, PSWP, "vmpfw", 0); - } - splx(s); + (void)vm_page_sleep_busy(fs.m, TRUE, "vmpfw"); + cnt.v_intrans++; vm_object_deallocate(fs.first_object); goto RetryFault; } @@ -302,8 +309,12 @@ RetryFault:; vm_page_unqueue_nowakeup(fs.m); splx(s); +#if 0 /* - * Mark page busy for other processes, and the pagedaemon. + * Code removed. In a low-memory situation (say, a + * memory-bound program is running), the last thing you + * do is starve reactivations for other processes. + * XXX we need to find a better way. */ if (((queue - fs.m->pc) == PQ_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { @@ -312,6 +323,13 @@ RetryFault:; VM_WAIT; goto RetryFault; } +#endif + /* + * Mark page busy for other processes, and the + * pagedaemon. If it still isn't completely valid + * (readable), jump to readrest, else break-out ( we + * found the page ). + */ vm_page_busy(fs.m); if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && @@ -321,6 +339,12 @@ RetryFault:; break; } + + /* + * Page is not resident, If this is the search termination, + * allocate a new page. + */ + if (((fs.object->type != OBJT_DEFAULT) && (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) || (fs.object == fs.first_object)) { @@ -344,6 +368,13 @@ RetryFault:; } readrest: + /* + * Have page, but it may not be entirely valid ( or valid at + * all ). If this object is not the default, try to fault-in + * the page as well as activate additional pages when + * appropriate, and page-in additional pages when appropriate. + */ + if (fs.object->type != OBJT_DEFAULT && (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) { int rv; @@ -410,13 +441,16 @@ readrest: * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. + * + * fs.m plus the additional pages are PG_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); /* * Call the pager to retrieve the data, if any, after - * releasing the lock on the map. + * releasing the lock on the map. We hold a ref on + * fs.object and the pages are PG_BUSY'd. */ unlock_map(&fs); @@ -442,7 +476,7 @@ readrest: } hardfault++; - break; + break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this @@ -486,8 +520,8 @@ readrest: } } /* - * We get here if the object has default pager (or unwiring) or the - * pager doesn't have the page. + * We get here if the object has default pager (or unwiring) + * or the pager doesn't have the page. 
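+	 * In that case we descend to the backing object, if there is
+	 * one, and retry the lookup; running off the end of the shadow
+	 * chain lands us in the zero-fill case below.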
*/ if (fs.object == fs.first_object) fs.first_m = fs.m; @@ -518,15 +552,17 @@ readrest: cnt.v_ozfod++; } cnt.v_zfod++; - break; + break; /* break to PAGE HAS BEEN FOUND */ } else { if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); } + KASSERT(fs.object != next_object, ("object loop %p", next_object)); fs.object = next_object; vm_object_pip_add(fs.object, 1); } } + KASSERT((fs.m->flags & PG_BUSY) != 0, ("vm_fault: not busy after main loop")); @@ -549,14 +585,15 @@ readrest: */ if (fault_type & VM_PROT_WRITE) { - /* - * This allows pages to be virtually copied from a backing_object - * into the first_object, where the backing object has no other - * refs to it, and cannot gain any more refs. Instead of a - * bcopy, we just move the page from the backing object to the - * first object. Note that we must mark the page dirty in the - * first object so that it will go out to swap when needed. + * This allows pages to be virtually copied from a + * backing_object into the first_object, where the + * backing object has no other refs to it, and cannot + * gain any more refs. Instead of a bcopy, we just + * move the page from the backing object to the + * first object. Note that we must mark the page + * dirty in the first object so that it will go out + * to swap when needed. */ if (map_generation == fs.map->timestamp && /* @@ -598,11 +635,12 @@ readrest: fs.first_m = NULL; /* - * grab the page and put it into the process'es object + * grab the page and put it into the + * process'es object. The page is + * automatically made dirty. */ vm_page_rename(fs.m, fs.first_object, fs.first_pindex); fs.first_m = fs.m; - fs.first_m->dirty = VM_PAGE_BITS_ALL; vm_page_busy(fs.first_m); fs.m = NULL; cnt.v_cow_optim++; @@ -620,7 +658,13 @@ readrest: release_page(&fs); } + /* + * fs.object != fs.first_object due to above + * conditional + */ + vm_object_pip_wakeup(fs.object); + /* * Only use the new page below... */ @@ -708,9 +752,13 @@ readrest: * If the fault is a write, we know that this page is being * written NOW. This will save on the pmap_is_modified() calls * later. + * + * Also tell the backing pager, if any, that it should remove + * any swap backing since the page is now dirty. */ if (fault_flags & VM_FAULT_DIRTY) { fs.m->dirty = VM_PAGE_BITS_ALL; + vm_pager_page_unswapped(fs.m); } } @@ -1021,8 +1069,7 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) * if the requested page is not available, then give up now */ - if (!vm_pager_has_page(object, - OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) { + if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { return 0; } diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index ec844db..0a3309d 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -59,7 +59,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_glue.c,v 1.79 1998/12/19 08:23:31 julian Exp $ + * $Id: vm_glue.c,v 1.80 1999/01/07 21:23:50 julian Exp $ */ #include "opt_rlimit.h" @@ -213,10 +213,19 @@ vm_fork(p1, p2, flags) p1->p_vmspace->vm_refcnt++; } + /* + * Great, so we have a memory-heavy process and the + * entire machine comes to a screaching halt because + * nobody can fork/exec anything. What we really need + * to do is fix the process swapper so it swaps out the right + * processes. 
+ */ +#if 0 while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN); VM_WAIT; } +#endif if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index ea7f45b..b2e1102 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_kern.c,v 1.49 1998/08/24 08:39:37 dfr Exp $ + * $Id: vm_kern.c,v 1.50 1998/09/04 08:06:57 dfr Exp $ */ /* @@ -181,8 +181,9 @@ kmem_alloc(map, size) VM_ALLOC_ZERO | VM_ALLOC_RETRY); if ((mem->flags & PG_ZERO) == 0) vm_page_zero_fill(mem); - vm_page_flag_clear(mem, (PG_BUSY | PG_ZERO)); mem->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(mem, PG_ZERO); + vm_page_wakeup(mem); } /* @@ -200,6 +201,8 @@ kmem_alloc(map, size) * Release a region of kernel virtual memory allocated * with kmem_alloc, and return the physical pages * associated with that region. + * + * This routine may not block on kernel maps. */ void kmem_free(map, addr, size) @@ -252,26 +255,31 @@ kmem_suballoc(parent, min, max, size) } /* - * Allocate wired-down memory in the kernel's address map for the higher - * level kernel memory allocator (kern/kern_malloc.c). We cannot use - * kmem_alloc() because we may need to allocate memory at interrupt - * level where we cannot block (canwait == FALSE). + * kmem_malloc: + * + * Allocate wired-down memory in the kernel's address map for the higher + * level kernel memory allocator (kern/kern_malloc.c). We cannot use + * kmem_alloc() because we may need to allocate memory at interrupt + * level where we cannot block (canwait == FALSE). + * + * This routine has its own private kernel submap (kmem_map) and object + * (kmem_object). This, combined with the fact that only malloc uses + * this routine, ensures that we will never block in map or object waits. * - * This routine has its own private kernel submap (kmem_map) and object - * (kmem_object). This, combined with the fact that only malloc uses - * this routine, ensures that we will never block in map or object waits. + * Note that this still only works in a uni-processor environment and + * when called at splhigh(). * - * Note that this still only works in a uni-processor environment and - * when called at splhigh(). + * We don't worry about expanding the map (adding entries) since entries + * for wired maps are statically allocated. * - * We don't worry about expanding the map (adding entries) since entries - * for wired maps are statically allocated. + * NOTE: This routine is not supposed to block if M_NOWAIT is set, but + * I have not verified that it actually does not block. */ vm_offset_t -kmem_malloc(map, size, waitflag) +kmem_malloc(map, size, flags) register vm_map_t map; register vm_size_t size; - boolean_t waitflag; + int flags; { register vm_offset_t offset, i; vm_map_entry_t entry; @@ -297,7 +305,7 @@ kmem_malloc(map, size, waitflag) printf("Out of mbuf clusters - adjust NMBCLUSTERS or increase maxusers!\n"); return (0); } - if (waitflag == M_WAITOK) + if ((flags & M_NOWAIT) == 0) panic("kmem_malloc(%d): kmem_map too small: %d total allocated", size, map->size); return (0); @@ -308,9 +316,19 @@ kmem_malloc(map, size, waitflag) VM_PROT_ALL, VM_PROT_ALL, 0); for (i = 0; i < size; i += PAGE_SIZE) { + /* + * Note: if M_NOWAIT specified alone, allocate from + * interrupt-safe queues only (just the free list). 
If + * M_ASLEEP or M_USE_RESERVE is also specified, we can also + * allocate from the cache. Neither of the latter two + * flags may be specified from an interrupt since interrupts + * are not allowed to mess with the cache queue. + */ retry: m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), - (waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM); + ((flags & (M_NOWAIT|M_ASLEEP|M_USE_RESERVE)) == M_NOWAIT) ? + VM_ALLOC_INTERRUPT : + VM_ALLOC_SYSTEM); /* * Ran out of space, free everything up and return. Don't need @@ -318,7 +336,7 @@ retry: * aren't on any queues. */ if (m == NULL) { - if (waitflag == M_WAITOK) { + if ((flags & M_NOWAIT) == 0) { VM_WAIT; goto retry; } @@ -330,6 +348,9 @@ retry: } vm_map_delete(map, addr, addr + size); vm_map_unlock(map); + if (flags & M_ASLEEP) { + VM_AWAIT; + } return (0); } vm_page_flag_clear(m, PG_ZERO); @@ -359,6 +380,9 @@ retry: m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); vm_page_wire(m); vm_page_wakeup(m); + /* + * Because this is kernel_pmap, this call will not block. + */ pmap_enter(kernel_pmap, addr + i, VM_PAGE_TO_PHYS(m), VM_PROT_ALL, 1); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_REFERENCED); @@ -369,12 +393,14 @@ retry: } /* - * kmem_alloc_wait + * kmem_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * + * This routine may block. */ + vm_offset_t kmem_alloc_wait(map, size) vm_map_t map; @@ -406,7 +432,7 @@ kmem_alloc_wait(map, size) } /* - * kmem_free_wakeup + * kmem_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. @@ -424,11 +450,14 @@ kmem_free_wakeup(map, addr, size) } /* - * Create the kernel map; insert a mapping covering kernel text, data, bss, - * and all space allocated thus far (`boostrap' data). The new map will thus - * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and - * the range between `start' and `end' as free. + * kmem_init: + * + * Create the kernel map; insert a mapping covering kernel text, + * data, bss, and all space allocated thus far (`boostrap' data). The + * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and + * `start' as allocated, and the range between `start' and `end' as free. */ + void kmem_init(start, end) vm_offset_t start, end; @@ -445,3 +474,4 @@ kmem_init(start, end) /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } + diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 829548a..f495788 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_map.c,v 1.138 1998/10/25 17:44:58 phk Exp $ + * $Id: vm_map.c,v 1.139 1999/01/06 23:05:41 julian Exp $ */ /* @@ -440,7 +440,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; +#if 0 vm_object_t prev_object; +#endif u_char protoeflags; if ((object != NULL) && (cow & MAP_NOFAULT)) { @@ -514,10 +516,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, map->size += (end - prev_entry->end); prev_entry->end = end; +#if 0 + /* + * (no longer applies) + */ if ((cow & MAP_NOFAULT) == 0) { prev_object = prev_entry->object.vm_object; default_pager_convert_to_swapq(prev_object); } +#endif return (KERN_SUCCESS); } else { @@ -573,7 +580,12 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, (prev_entry->end >= new_entry->start)) map->first_free = new_entry; +#if 0 + /* + * (no longer applies) + */ default_pager_convert_to_swapq(object); +#endif return (KERN_SUCCESS); } @@ -1504,7 +1516,12 @@ vm_map_user_pageable(map, start, end, new_pageable) entry->offset = (vm_offset_t) 0; } +#if 0 + /* + * (no longer applies) + */ default_pager_convert_to_swapq(entry->object.vm_object); +#endif } vm_map_clip_start(map, entry, start); @@ -1695,7 +1712,12 @@ vm_map_pageable(map, start, end, new_pageable) atop(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } +#if 0 + /* + * (no longer applies) + */ default_pager_convert_to_swapq(entry->object.vm_object); +#endif } } vm_map_clip_start(map, entry, start); @@ -2192,16 +2214,18 @@ vm_map_split(entry) m = vm_page_lookup(orig_object, offidxstart + idx); if (m == NULL) continue; - if (m->flags & PG_BUSY) { - vm_page_flag_set(m, PG_WANTED); - tsleep(m, PVM, "spltwt", 0); + + /* + * We must wait for pending I/O to complete before we can + * rename the page. + */ + if (vm_page_sleep_busy(m, TRUE, "spltwt")) goto retry; - } vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_rename(m, new_object, idx); - m->dirty = VM_PAGE_BITS_ALL; + /* page automatically made dirty by rename */ vm_page_busy(m); } @@ -2212,9 +2236,7 @@ vm_map_split(entry) * and destroy unneeded pages in * shadow object. */ - swap_pager_copy(orig_object, OFF_TO_IDX(orig_object->paging_offset), - new_object, OFF_TO_IDX(new_object->paging_offset), - offidxstart, 0); + swap_pager_copy(orig_object, new_object, offidxstart, 0); vm_object_pip_wakeup(orig_object); } @@ -2670,8 +2692,13 @@ RetryLookup:; vm_map_lock_downgrade(share_map); } +#if 0 + /* + * (no longer applies) + */ if (entry->object.vm_object->type == OBJT_DEFAULT) default_pager_convert_to_swapq(entry->object.vm_object); +#endif /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. @@ -2781,6 +2808,10 @@ vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages) vm_map_lookup_done(map, entry); return 0; } + /* + * disallow busy or invalid pages, but allow + * m->busy pages if they are entirely valid. + */ if ((m->flags & PG_BUSY) || ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { vm_map_lookup_done(map, entry); @@ -2856,7 +2887,7 @@ vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages) */ if (first_object->type == OBJT_SWAP) { swap_pager_freespace(first_object, - OFF_TO_IDX(first_object->paging_offset), + 0, first_object->size); } diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 5bc74bd..bb52f66 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 - * $Id: vm_meter.c,v 1.26 1998/08/24 08:39:37 dfr Exp $ + * $Id: vm_meter.c,v 1.27 1998/10/31 17:21:31 peter Exp $ */ #include <sys/param.h> @@ -195,6 +195,11 @@ vmtotal SYSCTL_HANDLER_ARGS for (object = TAILQ_FIRST(&vm_object_list); object != NULL; object = TAILQ_NEXT(object, object_list)) { + /* + * devices, like /dev/mem, will badly skew our totals + */ + if (object->type == OBJT_DEVICE) + continue; totalp->t_vm += object->size; totalp->t_rm += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index ba36e41..1374dfb 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -38,7 +38,7 @@ * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id: vm_mmap.c,v 1.85 1998/12/09 20:22:21 dt Exp $ + * $Id: vm_mmap.c,v 1.86 1999/01/06 23:05:42 julian Exp $ */ /* @@ -71,6 +71,7 @@ #include <vm/pmap.h> #include <vm/vm_map.h> #include <vm/vm_object.h> +#include <vm/vm_page.h> #include <vm/vm_pager.h> #include <vm/vm_pageout.h> #include <vm/vm_extern.h> diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index a1477f2..86c71c8 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.c,v 1.137 1999/01/08 17:31:26 eivind Exp $ + * $Id: vm_object.c,v 1.138 1999/01/10 01:58:28 eivind Exp $ */ /* @@ -134,9 +134,12 @@ static long object_bypasses; static int next_index; static vm_zone_t obj_zone; static struct vm_zone obj_zone_store; +static int object_hash_rand; #define VM_OBJECTS_INIT 256 static struct vm_object vm_objects_init[VM_OBJECTS_INIT]; +#if 0 static int objidnumber; +#endif void _vm_object_allocate(type, size, object) @@ -152,7 +155,9 @@ _vm_object_allocate(type, size, object) object->size = size; object->ref_count = 1; object->flags = 0; +#if 0 object->id = ++objidnumber; +#endif if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) vm_object_set_flag(object, OBJ_ONEMAPPING); object->behavior = OBJ_NORMAL; @@ -168,16 +173,25 @@ _vm_object_allocate(type, size, object) incr = size; next_index = (next_index + incr) & PQ_L2_MASK; object->handle = NULL; - object->paging_offset = (vm_ooffset_t) 0; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; +#if 0 object->page_hint = NULL; +#endif + /* + * Try to generate a number that will spread objects out in the + * hash table. We 'wipe' new objects across the hash in 128 page + * increments plus 1 more to offset it a little more by the time + * it wraps around. 
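+	 *
+	 * E.g. successive objects receive hash_rand values of -129,
+	 * -258, -387, ..., so pages with equal pindexes in different
+	 * objects tend to fall into different hash buckets.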
+ */ + object->hash_rand = object_hash_rand - 129; object->last_read = 0; object->generation++; TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); vm_object_count++; + object_hash_rand = object->hash_rand; } /* @@ -336,25 +350,15 @@ vm_object_deallocate(object) robject->ref_count++; - retry: - if (robject->paging_in_progress || - object->paging_in_progress) { + while ( + robject->paging_in_progress || + object->paging_in_progress + ) { vm_object_pip_sleep(robject, "objde1"); - if (robject->paging_in_progress && - robject->type == OBJT_SWAP) { - swap_pager_sync(); - goto retry; - } - vm_object_pip_sleep(object, "objde2"); - if (object->paging_in_progress && - object->type == OBJT_SWAP) { - swap_pager_sync(); - } - goto retry; } - if( robject->ref_count == 1) { + if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; @@ -396,6 +400,7 @@ doterm: * up all previously used resources. * * The object must be locked. + * This routine may block. */ void vm_object_terminate(object) @@ -444,13 +449,13 @@ vm_object_terminate(object) /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. Don't free wired pages, just - * remove them from the object. + * remove them from the object. */ s = splvm(); while ((p = TAILQ_FIRST(&object->memq)) != NULL) { #if !defined(MAX_PERF) if (p->busy || (p->flags & PG_BUSY)) - printf("vm_object_terminate: freeing busy page\n"); + panic("vm_object_terminate: freeing busy page %p\n", p); #endif if (p->wire_count == 0) { vm_page_busy(p); @@ -566,9 +571,7 @@ rescan: } s = splvm(); - while ((p->flags & PG_BUSY) || p->busy) { - vm_page_flag_set(p, PG_WANTED | PG_REFERENCED); - tsleep(p, PVM, "vpcwai", 0); + while (vm_page_sleep_busy(p, TRUE, "vpcwai")) { if (object->generation != curgeneration) { splx(s); goto rescan; @@ -763,6 +766,12 @@ vm_object_pmap_remove(object, start, end) * vm_object_madvise: * * Implements the madvise function at the object/page level. + * + * Currently, madvise() functions are limited to the default and + * swap object types only, and also limited to only the unshared portions + * of a process's address space. MADV_FREE, certainly, could never be + * run on anything else. The others are more flexible and the code could + * be adjusted in the future to handle expanded cases for them. */ void vm_object_madvise(object, pindex, count, advise) @@ -780,22 +789,59 @@ vm_object_madvise(object, pindex, count, advise) end = pindex + count; - for (; pindex < end; pindex += 1) { + /* + * MADV_FREE special case - free any swap backing store (as well + * as resident pages later on). 
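
For context, this is the kernel side of the userland hint; a consumer such as a malloc implementation would mark a dead run of pages roughly as follows (illustrative userland fragment, not part of the patch):

	#include <sys/types.h>
	#include <sys/mman.h>

	/*
	 * Tell the VM system the contents of the run are disposable.  Per
	 * the MADV_FREE handling above, any swap backing store is released
	 * and resident pages are cleaned and cached; the mapping itself
	 * stays valid and refaults as zero-fill if touched again.
	 */
	static void
	release_run(void *addr, size_t len)
	{
		(void)madvise(addr, len, MADV_FREE);
	}
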
+ */ + + if (advise == MADV_FREE) { + tobject = object; + tpindex = pindex; + while ( + (tobject->type == OBJT_DEFAULT || + tobject->type == OBJT_SWAP) && + (tobject->flags & OBJ_ONEMAPPING) + ) { + if (tobject->type == OBJT_SWAP) { + swap_pager_freespace(tobject, tpindex, count); + } + if ((tobject = tobject->backing_object) == NULL) + break; + tpindex += OFF_TO_IDX(tobject->backing_object_offset); + } + } + + /* + * Locate and adjust resident pages + */ + + for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: + + if (tobject->type != OBJT_DEFAULT && + tobject->type != OBJT_SWAP + ) { + continue; + } + + if ((tobject->flags & OBJ_ONEMAPPING) == 0) + continue; + m = vm_page_lookup(tobject, tpindex); + if (m == NULL) { - if (tobject->type != OBJT_DEFAULT) { - continue; - } - tobject = tobject->backing_object; + if (tobject == NULL) + continue; +#if 0 if ((tobject == NULL) || (tobject->ref_count != 1)) { continue; } +#endif tpindex += OFF_TO_IDX(tobject->backing_object_offset); goto shadowlookup; } @@ -805,12 +851,15 @@ shadowlookup: * we skip it. Things can break if we mess with pages * in any of the below states. */ - if (m->hold_count || m->wire_count || - m->valid != VM_PAGE_BITS_ALL) { + if ( + m->hold_count || + m->wire_count || + m->valid != VM_PAGE_BITS_ALL + ) { continue; } - if (vm_page_sleep(m, "madvpo", &m->busy)) + if (vm_page_sleep_busy(m, TRUE, "madvpo")) goto relookup; if (advise == MADV_WILLNEED) { @@ -818,15 +867,25 @@ shadowlookup: } else if (advise == MADV_DONTNEED) { vm_page_deactivate(m); } else if (advise == MADV_FREE) { - pmap_clear_modify(VM_PAGE_TO_PHYS(m)); - m->dirty = 0; /* - * Force a demand zero if attempt to read from swap. - * We currently don't handle vnode files correctly, - * and will reread stale contents unnecessarily. + * If MADV_FREE_FORCE_FREE is defined, we attempt to + * immediately free the page. Otherwise we just + * destroy any swap backing store, mark it clean, + * and stuff it into the cache. */ - if (object->type == OBJT_SWAP) - swap_pager_dmzspace(tobject, m->pindex, 1); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->dirty = 0; + +#ifdef MADV_FREE_FORCE_FREE + if (tobject->resident_page_count > 1) { + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } else +#endif + { + vm_page_cache(m); + } } } } @@ -900,8 +959,7 @@ vm_object_qcollapse(object) register vm_object_t object; { register vm_object_t backing_object; - register vm_pindex_t backing_offset_index, paging_offset_index; - vm_pindex_t backing_object_paging_offset_index; + register vm_pindex_t backing_offset_index; vm_pindex_t new_pindex; register vm_page_t p, pp; register vm_size_t size; @@ -913,27 +971,39 @@ vm_object_qcollapse(object) backing_object->ref_count += 2; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); - backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset); - paging_offset_index = OFF_TO_IDX(object->paging_offset); size = object->size; + p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next; + /* + * setup for loop. + * loop if the page isn't trivial. + */ + next = TAILQ_NEXT(p, listq); if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) || !p->valid || p->hold_count || p->wire_count || p->busy) { p = next; continue; } + + /* + * busy the page and move it from the backing store to the + * parent object. 
+ */ + vm_page_busy(p); + KASSERT(p->object == object, ("vm_object_qcollapse(): object mismatch")); + new_pindex = p->pindex - backing_offset_index; if (p->pindex < backing_offset_index || new_pindex >= size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, - backing_object_paging_offset_index+p->pindex, + p->pindex, 1); vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); @@ -941,16 +1011,16 @@ vm_object_qcollapse(object) pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, - paging_offset_index + new_pindex, NULL, NULL))) { + new_pindex, NULL, NULL))) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, - backing_object_paging_offset_index + p->pindex, 1); + p->pindex, 1); vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); } else { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, - backing_object_paging_offset_index + p->pindex, 1); + p->pindex, 1); if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); @@ -958,7 +1028,7 @@ vm_object_qcollapse(object) vm_page_protect(p, VM_PROT_NONE); vm_page_rename(p, object, new_pindex); - p->dirty = VM_PAGE_BITS_ALL; + /* page automatically made dirty by rename */ } } p = next; @@ -1049,9 +1119,10 @@ vm_object_collapse(object) */ while ((p = TAILQ_FIRST(&backing_object->memq)) != 0) { - - new_pindex = p->pindex - backing_offset_index; + if (vm_page_sleep_busy(p, TRUE, "vmocol")) + continue; vm_page_busy(p); + new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this @@ -1068,7 +1139,7 @@ vm_object_collapse(object) } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, - OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) { + new_pindex, NULL, NULL))) { vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); } else { @@ -1077,7 +1148,7 @@ vm_object_collapse(object) else vm_page_protect(p, VM_PROT_NONE); vm_page_rename(p, object, new_pindex); - p->dirty = VM_PAGE_BITS_ALL; + /* page automatically made dirty by rename */ } } } @@ -1088,52 +1159,22 @@ vm_object_collapse(object) if (backing_object->type == OBJT_SWAP) { vm_object_pip_add(backing_object, 1); - if (object->type == OBJT_SWAP) { - vm_object_pip_add(object, 1); - /* - * copy shadow object pages into ours - * and destroy unneeded pages in - * shadow object. - */ - swap_pager_copy( - backing_object, - OFF_TO_IDX(backing_object->paging_offset), - object, - OFF_TO_IDX(object->paging_offset), - OFF_TO_IDX(object->backing_object_offset), TRUE); - vm_object_pip_wakeup(object); - } else { - vm_object_pip_add(object, 1); - /* - * move the shadow backing_object's pager data to - * "object" and convert "object" type to OBJT_SWAP. - */ - object->type = OBJT_SWAP; - object->un_pager.swp.swp_nblocks = - backing_object->un_pager.swp.swp_nblocks; - object->un_pager.swp.swp_allocsize = - backing_object->un_pager.swp.swp_allocsize; - object->un_pager.swp.swp_blocks = - backing_object->un_pager.swp.swp_blocks; - object->un_pager.swp.swp_poip = /* XXX */ - backing_object->un_pager.swp.swp_poip; - object->paging_offset = backing_object->paging_offset + backing_offset; - TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); - - /* - * Convert backing object from OBJT_SWAP to - * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is - * actually necessary. 
- */ - backing_object->type = OBJT_DEFAULT; - TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list); - /* - * free unnecessary blocks - */ - swap_pager_freespace(object, 0, - OFF_TO_IDX(object->paging_offset)); - vm_object_pip_wakeup(object); - } + + /* + * scrap the paging_offset junk and do a + * discrete copy. This also removes major + * assumptions about how the swap-pager + * works from where it doesn't belong. The + * new swapper is able to optimize the + * destroy-source case. + */ + + vm_object_pip_add(object, 1); + swap_pager_copy( + backing_object, + object, + OFF_TO_IDX(object->backing_object_offset), TRUE); + vm_object_pip_wakeup(object); vm_object_pip_wakeup(backing_object); } @@ -1223,7 +1264,7 @@ vm_object_collapse(object) vm_page_busy(pp); if ((pp->valid == 0) && - !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) { + !vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * Page still needed. Can't go any * further. @@ -1318,7 +1359,7 @@ again: * interrupt -- minimize the spl transitions */ - if (vm_page_sleep(p, "vmopar", &p->busy)) + if (vm_page_sleep_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { @@ -1349,7 +1390,7 @@ again: * The busy flags are only cleared at * interrupt -- minimize the spl transitions */ - if (vm_page_sleep(p, "vmopar", &p->busy)) + if (vm_page_sleep_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { @@ -1589,11 +1630,10 @@ DB_SHOW_COMMAND(object, vm_object_print_static) object, (int)object->type, (u_long)object->size, object->resident_page_count, object->ref_count, object->flags); /* - * XXX no %qd in kernel. Truncate object->paging_offset and - * object->backing_object_offset. + * XXX no %qd in kernel. Truncate object->backing_object_offset. */ - db_iprintf(" sref=%d, offset=0x%lx, backing_object(%d)=(%p)+0x%lx\n", - object->shadow_count, (long)object->paging_offset, + db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n", + object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (long)object->backing_object_offset); diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 9897393..7f54ab6 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.h,v 1.50 1998/08/06 08:33:19 dfr Exp $ + * $Id: vm_object.h,v 1.51 1998/08/24 08:39:37 dfr Exp $ */ /* @@ -81,6 +81,7 @@ typedef enum obj_type objtype_t; * Types defined: * * vm_object_t Virtual memory object. + * */ struct vm_object { @@ -94,32 +95,49 @@ struct vm_object { int ref_count; /* How many refs?? 
*/ int shadow_count; /* how many objects that this is a shadow for */ int pg_color; /* color of first page in obj */ - int id; /* ID for no purpose, other than info */ +#if 0 + int id; /* ID for no purpose, other than info */ +#endif + int hash_rand; /* vm hash table randomizer */ u_short flags; /* see below */ u_short paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ u_short behavior; /* see below */ int resident_page_count; /* number of resident pages */ - int cache_count; /* number of cached pages */ - int wire_count; /* number of wired pages */ - vm_ooffset_t paging_offset; /* Offset into paging space */ + int cache_count; /* number of cached pages */ + int wire_count; /* number of wired pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ vm_offset_t last_read; /* last read in object -- detect seq behavior */ - vm_page_t page_hint; /* hint for last looked-up or allocated page */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ void *handle; union { + /* + * VNode pager + * + * vnp_size - current size of file + */ struct { - off_t vnp_size; /* Current size of file */ + off_t vnp_size; } vnp; + + /* + * Device pager + * + * devp_pglist - list of allocated pages + */ struct { - TAILQ_HEAD(, vm_page) devp_pglist; /* list of pages allocated */ + TAILQ_HEAD(, vm_page) devp_pglist; } devp; + + /* + * Swap pager + * + * swp_bcount - number of swap 'swblock' metablocks, each + * contains up to 16 swapblk assignments. + * see vm/swap_pager.h + */ struct { - int swp_nblocks; - int swp_allocsize; - struct swblock *swp_blocks; - short swp_poip; + int swp_bcount; } swp; } un_pager; }; @@ -132,7 +150,7 @@ struct vm_object { #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_WRITEABLE 0x0080 /* object has been made writable */ -#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */ +#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */ #define OBJ_CLEANING 0x0200 #define OBJ_OPT 0x1000 /* I/O optimization */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ @@ -197,12 +215,21 @@ vm_object_pip_wakeup(vm_object_t object) } static __inline void -vm_object_pip_sleep(vm_object_t object, char *waitid) +vm_object_pip_wakeupn(vm_object_t object, int i) { - int s; + if (i) + atomic_subtract_short(&object->paging_in_progress, i); + if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { + vm_object_clear_flag(object, OBJ_PIPWNT); + wakeup(object); + } +} +static __inline void +vm_object_pip_sleep(vm_object_t object, char *waitid) +{ if (object->paging_in_progress) { - s = splvm(); + int s = splvm(); if (object->paging_in_progress) { vm_object_set_flag(object, OBJ_PIPWNT); tsleep(object, PVM, waitid, 0); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c953559..2f0f4bd 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $Id: vm_page.c,v 1.115 1999/01/08 17:31:27 eivind Exp $ + * $Id: vm_page.c,v 1.116 1999/01/10 01:58:29 eivind Exp $ */ /* @@ -83,6 +83,7 @@ #include <vm/vm_object.h> #include <vm/vm_page.h> #include <vm/vm_pageout.h> +#include <vm/vm_pager.h> #include <vm/vm_extern.h> static void vm_page_queue_init __P((void)); @@ -95,7 +96,7 @@ static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t)); * page structure. 
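
vm_object_pip_wakeupn() above is the batched sibling of vm_object_pip_wakeup(): it retires several paging_in_progress references in one atomic subtraction, and a count of zero still performs the OBJ_PIPWNT check, so it doubles as a pure "poke the waiters" call. A hypothetical multi-page completion path (the function name is invented):

	/*
	 * Sketch: finish 'count' pages of I/O against 'object' at once
	 * instead of calling vm_object_pip_wakeup() in a loop.
	 */
	static void
	example_iodone(vm_object_t object, int count)
	{
		vm_object_pip_wakeupn(object, count);
	}
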
*/ -static struct pglist *vm_page_buckets; /* Array of buckets */ +static struct vm_page **vm_page_buckets; /* Array of buckets */ static int vm_page_bucket_count; /* How big is array? */ static int vm_page_hash_mask; /* Mask for hash function */ static volatile int vm_page_bucket_generation; @@ -162,7 +163,6 @@ static u_short vm_page_dev_bsize_chunks[] = { }; static __inline int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex)); -static int vm_page_freechk_and_unqueue __P((vm_page_t m)); static void vm_page_free_wakeup __P((void)); /* @@ -206,7 +206,7 @@ vm_page_startup(starta, enda, vaddr) { register vm_offset_t mapped; register vm_page_t m; - register struct pglist *bucket; + register struct vm_page **bucket; vm_size_t npages, page_range; register vm_offset_t new_start; int i; @@ -256,24 +256,30 @@ vm_page_startup(starta, enda, vaddr) * * The number of buckets MUST BE a power of 2, and the actual value is * the next power of 2 greater than the number of physical pages in - * the system. + * the system. + * + * We make the hash table approximately 2x the number of pages to + * reduce the chain length. This is about the same size using the + * singly-linked list as the 1x hash table we were using before + * using TAILQ but the chain length will be smaller. * * Note: This computation can be tweaked if desired. */ - vm_page_buckets = (struct pglist *) vaddr; + vm_page_buckets = (struct vm_page **)vaddr; bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { vm_page_bucket_count = 1; while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } + vm_page_bucket_count <<= 1; vm_page_hash_mask = vm_page_bucket_count - 1; /* * Validate these addresses. */ - new_start = start + vm_page_bucket_count * sizeof(struct pglist); + new_start = start + vm_page_bucket_count * sizeof(struct vm_page *); new_start = round_page(new_start); mapped = round_page(vaddr); vaddr = pmap_map(mapped, start, new_start, @@ -283,7 +289,7 @@ vm_page_startup(starta, enda, vaddr) bzero((caddr_t) mapped, vaddr - mapped); for (i = 0; i < vm_page_bucket_count; i++) { - TAILQ_INIT(bucket); + *bucket = NULL; bucket++; } @@ -353,13 +359,18 @@ vm_page_startup(starta, enda, vaddr) * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. * This routine may not block. + * + * We try to randomize the hash based on the object to spread the pages + * out in the hash table without it costing us too much. */ static __inline int vm_page_hash(object, pindex) vm_object_t object; vm_pindex_t pindex; { - return ((((uintptr_t) object) >> 5) + (pindex >> 1)) & vm_page_hash_mask; + int i = ((uintptr_t)object + pindex) ^ object->hash_rand; + + return(i & vm_page_hash_mask); } /* @@ -382,7 +393,7 @@ vm_page_insert(m, object, pindex) register vm_object_t object; register vm_pindex_t pindex; { - register struct pglist *bucket; + register struct vm_page **bucket; if (m->object != NULL) panic("vm_page_insert: already inserted"); @@ -399,7 +410,8 @@ vm_page_insert(m, object, pindex) */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; - TAILQ_INSERT_TAIL(bucket, m, hashq); + m->hnext = *bucket; + *bucket = m; vm_page_bucket_generation++; /* @@ -407,7 +419,9 @@ vm_page_insert(m, object, pindex) */ TAILQ_INSERT_TAIL(&object->memq, m, listq); +#if 0 m->object->page_hint = m; +#endif m->object->generation++; if (m->wire_count) @@ -417,50 +431,48 @@ vm_page_insert(m, object, pindex) object->cache_count++; /* - * And show that the object has one more resident page. 
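
A bare vm_page pointer per bucket is half the size of a TAILQ head, which is what pays for doubling vm_page_bucket_count while keeping the table's footprint roughly constant. The resulting insert and search, reduced to a sketch (splvm() protection assumed, as in vm_page_insert() and vm_page_lookup()):

	struct vm_page **bucket;

	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];

	m->hnext = *bucket;		/* head-insert the new page */
	*bucket = m;

	for (m = *bucket; m != NULL; m = m->hnext)	/* search the chain */
		if (m->object == object && m->pindex == pindex)
			break;
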
+ * show that the object has one more resident page. */ object->resident_page_count++; } /* - * vm_page_remove: [ internal use only ] + * vm_page_remove: * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page - * table and the object page list. + * table and the object page list, but do not invalidate/terminate + * the backing store. * * The object and page must be locked, and at splhigh. + * The underlying pmap entry (if any) is NOT removed here. * This routine may not block. - * - * I do not think the underlying pmap entry (if any) is removed here. */ -void +vm_object_t vm_page_remove(m) - register vm_page_t m; + vm_page_t m; { - register struct pglist *bucket; + register struct vm_page **bucket; vm_object_t object; if (m->object == NULL) - return; + return(NULL); #if !defined(MAX_PERF) if ((m->flags & PG_BUSY) == 0) { panic("vm_page_remove: page not busy"); } #endif - - vm_page_flag_clear(m, PG_BUSY); - if (m->flags & PG_WANTED) { - vm_page_flag_clear(m, PG_WANTED); - wakeup(m); - } + + /* + * Basically destroy the page. + */ + + vm_page_wakeup(m); object = m->object; - if (object->page_hint == m) - object->page_hint = NULL; if (m->wire_count) object->wire_count--; @@ -469,11 +481,23 @@ vm_page_remove(m) object->cache_count--; /* - * Remove from the object_object/offset hash table + * Remove from the object_object/offset hash table. The object + * must be on the hash queue, we will panic if it isn't + * + * Note: we must NULL-out m->hnext to prevent loops in detached + * buffers with vm_page_lookup(). */ bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)]; - TAILQ_REMOVE(bucket, m, hashq); + while (*bucket != m) { +#if !defined(MAX_PERF) + if (*bucket == NULL) + panic("vm_page_remove(): page not found in hash"); +#endif + bucket = &(*bucket)->hnext; + } + *bucket = m->hnext; + m->hnext = NULL; vm_page_bucket_generation++; /* @@ -490,6 +514,8 @@ vm_page_remove(m) object->generation++; m->object = NULL; + + return(object); } /* @@ -498,8 +524,14 @@ vm_page_remove(m) * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * + * NOTE: the code below does not lock. It will operate properly if + * an interrupt makes a change, but the generation algorithm will not + * operate properly in an SMP environment where both cpu's are able to run + * kernel code simultaniously. + * * The object must be locked. No side effects. * This routine may not block. + * This is a critical path routine */ vm_page_t @@ -508,25 +540,29 @@ vm_page_lookup(object, pindex) register vm_pindex_t pindex; { register vm_page_t m; - register struct pglist *bucket; + register struct vm_page **bucket; int generation; /* * Search the hash table for this object/offset pair */ +#if 0 if (object->page_hint && (object->page_hint->pindex == pindex) && (object->page_hint->object == object)) return object->page_hint; +#endif retry: generation = vm_page_bucket_generation; bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; - for (m = TAILQ_FIRST(bucket); m != NULL; m = TAILQ_NEXT(m,hashq)) { + for (m = *bucket; m != NULL; m = m->hnext) { if ((m->object == object) && (m->pindex == pindex)) { if (vm_page_bucket_generation != generation) goto retry; +#if 0 m->object->page_hint = m; +#endif return (m); } } @@ -545,6 +581,16 @@ retry: * This routine may not block. * * Note: this routine will raise itself to splvm(), the caller need not. + * + * Note: swap associated with the page must be invalidated by the move. 
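
The lookup above is deliberately lock-free: it snapshots vm_page_bucket_generation before walking the chain and retries if an insert or remove bumped the counter mid-scan. As the comment warns, this only holds up against interrupt-time mutation on a single cpu, not true SMP concurrency. The shape of the check, as a sketch:

	/*
	 * Sketch: generation-validated chain walk.  Any vm_page_insert()
	 * or vm_page_remove() increments vm_page_bucket_generation, which
	 * invalidates a walk in progress.
	 */
	retry:
		generation = vm_page_bucket_generation;
		for (m = *bucket; m != NULL; m = m->hnext) {
			if (m->object == object && m->pindex == pindex) {
				if (vm_page_bucket_generation != generation)
					goto retry;	/* chain changed under us */
				return (m);
			}
		}
		if (vm_page_bucket_generation != generation)
			goto retry;
		return (NULL);
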
We + * have to do this for several reasons: (1) we aren't freeing the + * page, (2) we are dirtying the page, (3) the VM system is probably + * moving the page from object A to B, and will then later move + * the backing store from A to B and we can't have a conflict. + * + * Note: we *always* dirty the page. It is necessary both for the + * fact that we moved it, and because we may be invalidating + * swap. */ void @@ -558,6 +604,7 @@ vm_page_rename(m, new_object, new_pindex) s = splvm(); vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); + m->dirty = VM_PAGE_BITS_ALL; splx(s); } @@ -625,6 +672,12 @@ vm_page_unqueue(m) * * Find a page on the specified queue with color optimization. * + * The page coloring optimization attempts to locate a page + * that does not overload other nearby pages in the object in + * the cpu's L1 or L2 caches. We need this optmization because + * cpu caches tend to be physical caches, while object spaces tend + * to be virtual. + * * This routine must be called at splvm(). * This routine may not block. */ @@ -759,7 +812,10 @@ vm_page_select_free(object, pindex, prefqueue) int i,j; int index, hindex; #endif - vm_page_t m, mh; + vm_page_t m; +#if 0 + vm_page_t mh; +#endif int oqueuediff; struct vpgqueues *pq; @@ -768,6 +824,7 @@ vm_page_select_free(object, pindex, prefqueue) else oqueuediff = PQ_ZERO - PQ_FREE; +#if 0 if (mh = object->page_hint) { if (mh->pindex == (pindex - 1)) { if ((mh->flags & PG_FICTITIOUS) == 0) { @@ -785,6 +842,7 @@ vm_page_select_free(object, pindex, prefqueue) } } } +#endif pq = &vm_page_queues[prefqueue]; @@ -857,6 +915,8 @@ vm_page_select_free(object, pindex, prefqueue) * Additional special handling is required when called from an * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with * the page cache in this case. + * + * vm_page_alloc() */ vm_page_t vm_page_alloc(object, pindex, page_req) @@ -864,7 +924,7 @@ vm_page_alloc(object, pindex, page_req) vm_pindex_t pindex; int page_req; { - register vm_page_t m; + register vm_page_t m = NULL; struct vpgqueues *pq; vm_object_t oldobject; int queue, qtype; @@ -873,12 +933,17 @@ vm_page_alloc(object, pindex, page_req) KASSERT(!vm_page_lookup(object, pindex), ("vm_page_alloc: page already allocated")); + /* + * The pager is allowed to eat deeper into the free page list. + */ + if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splvm(); +loop: switch (page_req) { case VM_ALLOC_NORMAL: @@ -961,20 +1026,36 @@ vm_page_alloc(object, pindex, page_req) queue = m->queue; qtype = queue - m->pc; - if (qtype == PQ_ZERO) - vm_page_zero_count--; + + /* + * Cache pages must be formally freed (and doubly so with the + * new pagerops functions). We free the page and try again. + * + * This also has the side effect of ensuring that the minfreepage + * wall is held more tightly verses the old code. 
+ */ + + if (qtype == PQ_CACHE) { +#if !defined(MAX_PERF) + if (m->dirty) + panic("found dirty cache page %p", m); + +#endif + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + goto loop; + } + pq = &vm_page_queues[queue]; TAILQ_REMOVE(pq->pl, m, pageq); (*pq->cnt)--; (*pq->lcnt)--; oldobject = NULL; + if (qtype == PQ_ZERO) { + vm_page_zero_count--; m->flags = PG_ZERO | PG_BUSY; - } else if (qtype == PQ_CACHE) { - oldobject = m->object; - vm_page_busy(m); - vm_page_remove(m); - m->flags = PG_BUSY; } else { m->flags = PG_BUSY; } @@ -1004,6 +1085,12 @@ vm_page_alloc(object, pindex, page_req) (cnt.v_free_count < cnt.v_pageout_free_min)) pagedaemon_wakeup(); +#if 0 + /* + * (code removed - was previously a manual breakout of the act of + * freeing a page from cache. We now just call vm_page_free() on + * a cache page an loop so this code no longer needs to be here) + */ if ((qtype == PQ_CACHE) && ((page_req == VM_ALLOC_NORMAL) || (page_req == VM_ALLOC_ZERO)) && oldobject && (oldobject->type == OBJT_VNODE) && @@ -1017,6 +1104,7 @@ vm_page_alloc(object, pindex, page_req) } } } +#endif splx(s); return (m); @@ -1048,6 +1136,33 @@ vm_wait() } /* + * vm_await: (also see VM_AWAIT macro) + * + * asleep on an event that will signal when free pages are available + * for allocation. + */ + +void +vm_await() +{ + int s; + + s = splvm(); + if (curproc == pageproc) { + vm_pageout_pages_needed = 1; + asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0); + } else { + if (!vm_pages_needed) { + vm_pages_needed++; + wakeup(&vm_pages_needed); + } + asleep(&cnt.v_free_count, PVM, "vmwait", 0); + } + splx(s); +} + +#if 0 +/* * vm_page_sleep: * * Block until page is no longer busy. @@ -1069,6 +1184,38 @@ vm_page_sleep(vm_page_t m, char *msg, char *busy) { return slept; } +#endif + +#if 0 + +/* + * vm_page_asleep: + * + * Similar to vm_page_sleep(), but does not block. Returns 0 if + * the page is not busy, or 1 if the page is busy. + * + * This routine has the side effect of calling asleep() if the page + * was busy (1 returned). + */ + +int +vm_page_asleep(vm_page_t m, char *msg, char *busy) { + int slept = 0; + if ((busy && *busy) || (m->flags & PG_BUSY)) { + int s; + s = splvm(); + if ((busy && *busy) || (m->flags & PG_BUSY)) { + vm_page_flag_set(m, PG_WANTED); + asleep(m, PVM, msg, 0); + slept = 1; + } + splx(s); + } + return slept; +} + +#endif + /* * vm_page_activate: * @@ -1111,13 +1258,49 @@ vm_page_activate(m) * * This routine may not block. */ -static int -vm_page_freechk_and_unqueue(m) - vm_page_t m; +static __inline void +vm_page_free_wakeup() { - vm_object_t oldobject; + /* + * if pageout daemon needs pages, then tell it that there are + * some free. + */ + if (vm_pageout_pages_needed) { + wakeup(&vm_pageout_pages_needed); + vm_pageout_pages_needed = 0; + } + /* + * wakeup processes that are waiting on memory if we hit a + * high water mark. And wakeup scheduler process if we have + * lots of memory. this process will swapin processes. + */ + if (vm_pages_needed && + ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) { + wakeup(&cnt.v_free_count); + vm_pages_needed = 0; + } +} - oldobject = m->object; +/* + * vm_page_free_toq: + * + * Returns the given page to the PQ_FREE or PQ_ZERO list, + * disassociating it with any VM object. + * + * Object and page must be locked prior to entry. + * This routine may not block. 
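
The allocator above no longer steals PQ_CACHE pages in place; a cache page is formally freed, which runs the normal object-detach path, and the allocation loops to pick it (or another page) back up from the free queues. Reduced to a sketch (select_page() is an invented stand-in for the queue selection logic above):

	loop:
		m = select_page(object, pindex, page_req);	/* hypothetical */
		if ((m->queue - m->pc) == PQ_CACHE) {
			/* cache pages are clean by definition */
			vm_page_busy(m);
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_free(m);	/* cache -> PQ_FREE, object detached */
			goto loop;		/* reallocate from the free queues */
		}
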
+ */ + +void +vm_page_free_toq(vm_page_t m, int queue) +{ + int s; + struct vpgqueues *pq; + vm_object_t object = m->object; + + s = splvm(); + + cnt.v_tfree++; #if !defined(MAX_PERF) if (m->busy || ((m->queue - m->pc) == PQ_FREE) || @@ -1133,11 +1316,24 @@ vm_page_freechk_and_unqueue(m) } #endif + /* + * unqueue, then remove page. Note that we cannot destroy + * the page here because we do not want to call the pager's + * callback routine until after we've put the page on the + * appropriate free queue. + */ + vm_page_unqueue_nowakeup(m); vm_page_remove(m); + /* + * If fictitious remove object association and + * return, otherwise delay object association removal. + */ + if ((m->flags & PG_FICTITIOUS) != 0) { - return 0; + splx(s); + return; } m->valid = 0; @@ -1156,10 +1352,17 @@ vm_page_freechk_and_unqueue(m) cnt.v_wire_count--; } - if (oldobject && (oldobject->type == OBJT_VNODE) && - ((oldobject->flags & OBJ_DEAD) == 0)) { - struct vnode *vp; - vp = (struct vnode *) oldobject->handle; + /* + * If we've exhausted the object's resident pages we want to free + * it up. + */ + + if (object && + (object->type == OBJT_VNODE) && + ((object->flags & OBJ_DEAD) == 0) + ) { + struct vnode *vp = (struct vnode *)object->handle; + if (vp && VSHOULDFREE(vp)) { if ((vp->v_flag & (VTBFREE|VDOOMED|VFREE)) == 0) { TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist); @@ -1172,107 +1375,31 @@ vm_page_freechk_and_unqueue(m) pmap_page_is_free(m); #endif - return 1; -} - -/* - * helper routine for vm_page_free and vm_page_free_zero. - * - * This routine may not block. - */ -static __inline void -vm_page_free_wakeup() -{ - -/* - * if pageout daemon needs pages, then tell it that there are - * some free. - */ - if (vm_pageout_pages_needed) { - wakeup(&vm_pageout_pages_needed); - vm_pageout_pages_needed = 0; - } - /* - * wakeup processes that are waiting on memory if we hit a - * high water mark. And wakeup scheduler process if we have - * lots of memory. this process will swapin processes. - */ - if (vm_pages_needed && - ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) { - wakeup(&cnt.v_free_count); - vm_pages_needed = 0; - } -} - -/* - * vm_page_free: - * - * Returns the given page to the free list, - * disassociating it with any VM object. - * - * Object and page must be locked prior to entry. - * This routine may not block. - */ -void -vm_page_free(m) - register vm_page_t m; -{ - int s; - struct vpgqueues *pq; - - s = splvm(); - - cnt.v_tfree++; - - if (!vm_page_freechk_and_unqueue(m)) { - splx(s); - return; - } - - m->queue = PQ_FREE + m->pc; + m->queue = queue + m->pc; pq = &vm_page_queues[m->queue]; ++(*pq->lcnt); ++(*pq->cnt); - /* - * If the pageout process is grabbing the page, it is likely - * that the page is NOT in the cache. It is more likely that - * the page will be partially in the cache if it is being - * explicitly freed. - */ - if (curproc == pageproc) { - TAILQ_INSERT_TAIL(pq->pl, m, pageq); - } else { - TAILQ_INSERT_HEAD(pq->pl, m, pageq); - } - vm_page_free_wakeup(); - splx(s); -} - -void -vm_page_free_zero(m) - register vm_page_t m; -{ - int s; - struct vpgqueues *pq; - - s = splvm(); - - cnt.v_tfree++; + if (queue == PQ_ZERO) { + TAILQ_INSERT_HEAD(pq->pl, m, pageq); + ++vm_page_zero_count; + } else { + /* + * If the pageout process is grabbing the page, it is likely + * that the page is NOT in the cache. It is more likely that + * the page will be partially in the cache if it is being + * explicitly freed. 
+ */ - if (!vm_page_freechk_and_unqueue(m)) { - splx(s); - return; + if (curproc == pageproc) { + TAILQ_INSERT_TAIL(pq->pl, m, pageq); + } else { + TAILQ_INSERT_HEAD(pq->pl, m, pageq); + } } - m->queue = PQ_ZERO + m->pc; - pq = &vm_page_queues[m->queue]; - ++(*pq->lcnt); - ++(*pq->cnt); - - TAILQ_INSERT_HEAD(pq->pl, m, pageq); - ++vm_page_zero_count; vm_page_free_wakeup(); + splx(s); } @@ -1311,6 +1438,17 @@ vm_page_wire(m) * Release one wiring of this page, potentially * enabling it to be paged again. * + * Many pages placed on the inactive queue should actually go + * into the cache, but it is difficult to figure out which. What + * we do instead, if the inactive target is well met, is to put + * clean pages at the head of the inactive queue instead of the tail. + * This will cause them to be moved to the cache more quickly and + * if not actively re-referenced, freed more quickly. If we just + * stick these pages at the end of the inactive queue, heavy filesystem + * meta-data accesses can cause an unnecessary paging load on memory bound + * processes. This optimization causes one-time-use metadata to be + * reused more quickly. + * * The page queues must be locked. * This routine may not block. */ @@ -1351,7 +1489,8 @@ vm_page_unwire(m, activate) /* - * Move the specified page to the inactive queue. + * Move the specified page to the inactive queue. If the page has + * any associated swap, the swap is deallocated. * * This routine may not block. */ @@ -1383,7 +1522,8 @@ vm_page_deactivate(m) /* * vm_page_cache * - * Put the specified page onto the page cache queue (if appropriate). + * Put the specified page onto the page cache queue (if appropriate). + * * This routine may not block. */ void @@ -1624,7 +1764,7 @@ again1: } next = TAILQ_NEXT(m, pageq); - if (vm_page_sleep(m, "vpctw0", &m->busy)) + if (vm_page_sleep_busy(m, TRUE, "vpctw0")) goto again1; vm_page_test_dirty(m); if (m->dirty) { @@ -1652,7 +1792,7 @@ again1: } next = TAILQ_NEXT(m, pageq); - if (vm_page_sleep(m, "vpctw1", &m->busy)) + if (vm_page_sleep_busy(m, TRUE, "vpctw1")) goto again1; vm_page_test_dirty(m); if (m->dirty) { diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 3149391..f9e4926 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_page.h,v 1.48 1998/10/28 13:37:02 dg Exp $ + * $Id: vm_page.h,v 1.49 1999/01/08 17:31:28 eivind Exp $ */ /* @@ -105,10 +105,10 @@ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ - TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */ - TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ + struct vm_page *hnext; /* hash table link (O,P) */ + TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ - vm_object_t object; /* which object am I in (O,P) */ + vm_object_t object; /* which object am I in (O,P)*/ vm_pindex_t pindex; /* offset into object (O,P) */ vm_offset_t phys_addr; /* physical address of page */ u_short queue; /* page queue index */ @@ -130,6 +130,13 @@ struct vm_page { }; /* + * note SWAPBLK_NONE is a flag, basically the high bit. 
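
With a 32-bit signed daddr_t, the two SWAPBLK constants work out as follows (worked example; wider daddr_t types scale the same way):

	/*
	 * (u_daddr_t)-1 >> 1   == 0x7fffffff  -> SWAPBLK_MASK (payload bits)
	 * SWAPBLK_MASK + 1     == 0x80000000  -> SWAPBLK_NONE (the high bit)
	 *
	 * A real swap block number n always fits in (n & SWAPBLK_MASK);
	 * a slot with no swap assigned is marked with the SWAPBLK_NONE
	 * bit rather than with a magic in-range value.
	 */
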
+ */ + +#define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */ +#define SWAPBLK_NONE ((daddr_t)((u_daddr_t)SWAPBLK_MASK + 1))/* flag */ + +/* * Page coloring parameters */ /* Each of PQ_FREE, PQ_ZERO and PQ_CACHE have PQ_HASH_SIZE entries */ @@ -201,14 +208,15 @@ extern struct vpgqueues { * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. */ -#define PG_BUSY 0x01 /* page is in transit (O) */ -#define PG_WANTED 0x02 /* someone is waiting for page (O) */ -#define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */ -#define PG_WRITEABLE 0x10 /* page is mapped writeable */ -#define PG_MAPPED 0x20 /* page is mapped */ -#define PG_ZERO 0x40 /* page is zeroed */ -#define PG_REFERENCED 0x80 /* page has been referenced */ -#define PG_CLEANCHK 0x100 /* page will be checked for cleaning */ +#define PG_BUSY 0x0001 /* page is in transit (O) */ +#define PG_WANTED 0x0002 /* someone is waiting for page (O) */ +#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ +#define PG_WRITEABLE 0x0010 /* page is mapped writeable */ +#define PG_MAPPED 0x0020 /* page is mapped */ +#define PG_ZERO 0x0040 /* page is zeroed */ +#define PG_REFERENCED 0x0080 /* page has been referenced */ +#define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */ +#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ /* * Misc constants. @@ -307,16 +315,36 @@ vm_page_busy(vm_page_t m) vm_page_flag_set(m, PG_BUSY); } +/* + * vm_page_flash: + * + * wakeup anyone waiting for the page. + */ + static __inline void -vm_page_wakeup(vm_page_t m) +vm_page_flash(vm_page_t m) { - vm_page_flag_clear(m, PG_BUSY); if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } } +/* + * vm_page_wakeup: + * + * clear the PG_BUSY flag and wakeup anyone waiting for the + * page. 
+ * + */ + +static __inline void +vm_page_wakeup(vm_page_t m) +{ + vm_page_flag_clear(m, PG_BUSY); + vm_page_flash(m); +} + static __inline void vm_page_io_start(vm_page_t m) { @@ -327,10 +355,8 @@ static __inline void vm_page_io_finish(vm_page_t m) { atomic_subtract_char(&m->busy, 1); - if ((m->flags & PG_WANTED) && m->busy == 0) { - vm_page_flag_clear(m, PG_WANTED); - wakeup(m); - } + if (m->busy == 0) + vm_page_flash(m); } @@ -353,12 +379,13 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); +static __inline void vm_page_free __P((vm_page_t)); +static __inline void vm_page_free_zero __P((vm_page_t)); +void vm_page_destroy __P((vm_page_t)); void vm_page_deactivate __P((vm_page_t)); -void vm_page_free __P((vm_page_t)); -void vm_page_free_zero __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t)); -void vm_page_remove __P((vm_page_t)); +vm_object_t vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t, int)); @@ -374,7 +401,11 @@ int vm_page_bits __P((int, int)); vm_page_t vm_page_list_find __P((int, int)); int vm_page_queue_index __P((vm_offset_t, int)); vm_page_t vm_page_select __P((vm_object_t, vm_pindex_t, int)); +#if 0 int vm_page_sleep(vm_page_t m, char *msg, char *busy); +int vm_page_asleep(vm_page_t m, char *msg, char *busy); +#endif +void vm_page_free_toq(vm_page_t m, int queue); /* * Keep page from being freed by the page daemon @@ -438,5 +469,64 @@ vm_page_copy(src_m, dest_m) dest_m->valid = VM_PAGE_BITS_ALL; } +/* + * vm_page_free: + * + * Free a page + */ +static __inline void +vm_page_free(m) + vm_page_t m; +{ + vm_page_free_toq(m, PQ_FREE); +} + +/* + * vm_page_free_zero: + * + * Free a page to the zerod-pages queue + */ +static __inline void +vm_page_free_zero(m) + vm_page_t m; +{ + vm_page_free_toq(m, PQ_ZERO); +} + +/* + * vm_page_sleep_busy: + * + * Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE) + * m->busy is zero. Returns TRUE if it had to sleep ( including if + * it almost had to sleep and made temporary spl*() mods), FALSE + * otherwise. + * + * This routine assumes that interrupts can only remove the busy + * status from a page, not set the busy status or change it from + * PG_BUSY to m->busy or vise versa (which would create a timing + * window). + * + * Note that being an inline, this code will be well optimized. + */ + +static __inline int +vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg) +{ + if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) { + int s = splvm(); + if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) { + /* + * Page is busy. Wait and retry. + */ + vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); + tsleep(m, PVM, msg, 0); + } + splx(s); + return(TRUE); + /* not reached */ + } + return(FALSE); +} + #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 606981f..06f24d6 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -65,7 +65,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_pageout.c,v 1.128 1998/10/25 17:44:59 phk Exp $ + * $Id: vm_pageout.c,v 1.129 1998/10/31 17:21:31 peter Exp $ */ /* @@ -211,13 +211,10 @@ void pmap_collect(void); * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to - * block. - * - * And we set pageout-in-progress to keep the object from disappearing - * during pageout. This guarantees that the page won't move from the - * inactive queue. (However, any other page on the inactive queue may - * move!) + * block. Note the careful timing, however, the busy bit isn't set till + * late and we cannot do anything that will mess with the page. */ + static int vm_pageout_clean(m) vm_page_t m; @@ -231,12 +228,23 @@ vm_pageout_clean(m) object = m->object; /* + * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP + * with the new swapper, but we could have serious problems paging + * out other object types if there is insufficient memory. + * + * Unfortunately, checking free memory here is far too late, so the + * check has been moved up a procedural level. + */ + +#if 0 + /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((object->type == OBJT_DEFAULT) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; +#endif /* * Don't mess with the page if it's busy. @@ -245,12 +253,21 @@ vm_pageout_clean(m) ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; +#if 0 + /* + * XXX REMOVED XXX. vm_object_collapse() can block, which can + * change the page state. Calling vm_object_collapse() might also + * destroy or rename the page because we have not busied it yet!!! + * So this code segment is removed. + */ /* - * Try collapsing before it's too late. + * Try collapsing before it's too late. XXX huh? Why are we doing + * this here? */ if (object->backing_object) { vm_object_collapse(object); } +#endif mc[vm_pageout_page_count] = m; pageout_count = 1; @@ -351,6 +368,16 @@ do_backward: return vm_pageout_flush(&mc[page_base], pageout_count, 0); } +/* + * vm_pageout_flush() - launder the given pages + * + * The given pages are laundered. Note that we setup for the start of + * I/O ( i.e. busy the page ), mark it read-only, and bump the object + * reference count all in here rather then in the parent. If we want + * the parent to do more sophisticated things we may have to change + * the ordering. + */ + int vm_pageout_flush(mc, count, flags) vm_page_t *mc; @@ -362,6 +389,14 @@ vm_pageout_flush(mc, count, flags) int numpagedout = 0; int i; + /* + * Initiate I/O. Bump the vm_page_t->busy counter and + * mark the pages read-only. + * + * We do not have to fixup the clean/dirty bits here... we can + * allow the pager to do it after the I/O completes. + */ + for (i = 0; i < count; i++) { vm_page_io_start(mc[i]); vm_page_protect(mc[i], VM_PROT_READ); @@ -585,25 +620,24 @@ vm_pageout_map_deactivate_pages(map, desired) } #endif +/* + * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore + * to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects + * which we know can be trivially freed. 
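
vm_pageout_flush() above now owns the I/O setup itself: each page is soft-busied and write-protected before the run is handed to the pager, and the matching teardown happens from the pager's completion path. The bracket, as a sketch:

	/* before pgo_putpages, per page: */
	vm_page_io_start(m);			/* ++m->busy: soft-busy for I/O */
	vm_page_protect(m, VM_PROT_READ);	/* catch writes during pageout */

	/* from the pager's iodone path, per page: */
	vm_page_io_finish(m);			/* --m->busy; flashes waiters at 0 */
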
+ */ + void vm_pageout_page_free(vm_page_t m) { - struct vnode *vp; - vm_object_t object; - - object = m->object; - object->ref_count++; - - if (object->type == OBJT_VNODE) { - vp = object->handle; - vp->v_usecount++; - if (VSHOULDBUSY(vp)) - vbusy(vp); - } + vm_object_t object = m->object; + int type = object->type; + if (type == OBJT_SWAP || type == OBJT_DEFAULT) + vm_object_reference(object); vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); - vm_object_deallocate(object); + if (type == OBJT_SWAP || type == OBJT_DEFAULT) + vm_object_deallocate(object); } /* @@ -613,9 +647,10 @@ static int vm_pageout_scan() { vm_page_t m, next; - int page_shortage, addl_page_shortage, maxscan, pcount; + int page_shortage, maxscan, pcount; + int addl_page_shortage, addl_page_shortage_init; int maxlaunder; - int pages_freed; + int launder_loop = 0; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; @@ -629,31 +664,53 @@ vm_pageout_scan() */ pmap_collect(); - /* - * Start scanning the inactive queue for pages we can free. We keep - * scanning until we have enough free pages or we have scanned through - * the entire queue. If we encounter dirty pages, we start cleaning - * them. - */ - - pages_freed = 0; - addl_page_shortage = vm_pageout_deficit; + addl_page_shortage_init = vm_pageout_deficit; vm_pageout_deficit = 0; if (max_page_launder == 0) max_page_launder = 1; - maxlaunder = (cnt.v_inactive_target > max_page_launder) ? - max_page_launder : cnt.v_inactive_target; -rescan0: - maxscan = cnt.v_inactive_count; - for( m = TAILQ_FIRST(&vm_page_queue_inactive); + /* + * Calculate the number of pages we want to either free or move + * to the cache. + */ + + page_shortage = (cnt.v_free_target + cnt.v_cache_min) - + (cnt.v_free_count + cnt.v_cache_count); + page_shortage += addl_page_shortage_init; + + /* + * Figure out what to do with dirty pages when they are encountered. + * Assume that 1/3 of the pages on the inactive list are clean. If + * we think we can reach our target, disable laundering (do not + * clean any dirty pages). If we miss the target we will loop back + * up and do a laundering run. + */ - (m != NULL) && (maxscan-- > 0) && - ((cnt.v_cache_count + cnt.v_free_count) < - (cnt.v_cache_min + cnt.v_free_target)); + if (cnt.v_inactive_count / 3 > page_shortage) { + maxlaunder = 0; + launder_loop = 0; + } else { + maxlaunder = + (cnt.v_inactive_target > max_page_launder) ? + max_page_launder : cnt.v_inactive_target; + launder_loop = 1; + } - m = next) { + /* + * Start scanning the inactive queue for pages we can move to the + * cache or free. The scan will stop when the target is reached or + * we have scanned the entire inactive queue. + */ + +rescan0: + addl_page_shortage = addl_page_shortage_init; + maxscan = cnt.v_inactive_count; + for ( + m = TAILQ_FIRST(&vm_page_queue_inactive); + m != NULL && maxscan-- > 0 && page_shortage > 0; + m = next + ) { cnt.v_pdpages++; @@ -681,19 +738,21 @@ rescan0: } /* - * If the object is not being used, we ignore previous references. + * If the object is not being used, we ignore previous + * references. */ if (m->object->ref_count == 0) { vm_page_flag_clear(m, PG_REFERENCED); pmap_clear_reference(VM_PAGE_TO_PHYS(m)); /* - * Otherwise, if the page has been referenced while in the inactive - * queue, we bump the "activation count" upwards, making it less - * likely that the page will be added back to the inactive queue - * prematurely again. 
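
The clean-fraction heuristic above is easiest to see with numbers (hypothetical figures):

	/*
	 * v_free_target = 1024   v_cache_min   = 512
	 * v_free_count  =  256   v_cache_count = 128
	 *
	 *	page_shortage = (1024 + 512) - (256 + 128) = 1152
	 *
	 * v_inactive_count = 6000: estimated clean = 6000/3 = 2000 > 1152,
	 * so maxlaunder = 0 and the first pass moves clean pages only.
	 *
	 * v_inactive_count = 3000: estimated clean = 1000 <= 1152, so
	 * laundering is enabled on the first pass (launder_loop = 1).
	 */
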
Here we check the page tables (or emulated - * bits, if any), given the upper level VM system not knowing anything - * about existing references. + * Otherwise, if the page has been referenced while in the + * inactive queue, we bump the "activation count" upwards, + * making it less likely that the page will be added back to + * the inactive queue prematurely again. Here we check the + * page tables (or emulated bits, if any), given the upper + * level VM system not knowing anything about existing + * references. */ } else if (((m->flags & PG_REFERENCED) == 0) && (actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)))) { @@ -703,10 +762,10 @@ rescan0: } /* - * If the upper level VM system knows about any page references, - * we activate the page. We also set the "activation count" higher - * than normal so that we will less likely place pages back onto the - * inactive queue again. + * If the upper level VM system knows about any page + * references, we activate the page. We also set the + * "activation count" higher than normal so that we will less + * likely place pages back onto the inactive queue again. */ if ((m->flags & PG_REFERENCED) != 0) { vm_page_flag_clear(m, PG_REFERENCED); @@ -717,9 +776,10 @@ rescan0: } /* - * If the upper level VM system doesn't know anything about the - * page being dirty, we have to check for it again. As far as the - * VM code knows, any partially dirty pages are fully dirty. + * If the upper level VM system doesn't know anything about + * the page being dirty, we have to check for it again. As + * far as the VM code knows, any partially dirty pages are + * fully dirty. */ if (m->dirty == 0) { vm_page_test_dirty(m); @@ -733,14 +793,14 @@ rescan0: if (m->valid == 0) { vm_pageout_page_free(m); cnt.v_dfree++; - pages_freed++; + --page_shortage; /* * Clean pages can be placed onto the cache queue. */ } else if (m->dirty == 0) { vm_page_cache(m); - pages_freed++; + --page_shortage; /* * Dirty pages need to be paged out. Note that we clean @@ -763,8 +823,8 @@ rescan0: } /* - * We don't bother paging objects that are "dead". Those - * objects are in a "rundown" state. + * We don't bother paging objects that are "dead". + * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { s = splvm(); @@ -774,10 +834,61 @@ rescan0: continue; } - if ((object->type == OBJT_VNODE) && - (object->flags & OBJ_DEAD) == 0) { + /* + * For now we protect against potential memory + * deadlocks by requiring significant memory to be + * free if the object is not OBJT_DEFAULT or OBJT_SWAP. + * We do not 'trust' any other object type to operate + * with low memory, not even OBJT_DEVICE. The VM + * allocator will special case allocations done by + * the pageout daemon so the check below actually + * does have some hysteresis in it. It isn't the best + * solution, though. + */ + + if ( + object->type != OBJT_DEFAULT && + object->type != OBJT_SWAP && + cnt.v_free_count < cnt.v_free_reserved + ) { + s = splvm(); + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + splx(s); + continue; + } + + /* + * Presumably we have sufficient free memory to do + * the more sophisticated checks and locking required + * for vnodes. + * + * The object is already known NOT to be dead. The + * vget() may still block, though, because + * VOP_ISLOCKED() doesn't check to see if an inode + * (v_data) is associated with the vnode. If it isn't, + * vget() will load in it from disk. 
Worse, vget() + * may actually get stuck waiting on "inode" if another + * process is in the process of bringing the inode in. + * This is bad news for us either way. + * + * So for the moment we check v_data == NULL as a + * workaround. This means that vnodes which do not + * use v_data in the way we expect probably will not + * wind up being paged out by the pager and it will be + * up to the syncer to get them. That's better then + * us blocking here. + * + * This whole code section is bogus - we need to fix + * the vnode pager to handle vm_page_t's without us + * having to do any sophisticated VOP tests. + */ + + if (object->type == OBJT_VNODE) { vp = object->handle; + if (VOP_ISLOCKED(vp) || + vp->v_data == NULL || vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { if ((m->queue == PQ_INACTIVE) && (m->hold_count == 0) && @@ -844,19 +955,34 @@ rescan0: } /* - * Compute the page shortage. If we are still very low on memory be - * sure that we will move a minimal amount of pages from active to - * inactive. + * If we still have a page shortage and we didn't launder anything, + * run the inactive scan again and launder something this time. + */ + + if (launder_loop == 0 && page_shortage > 0) { + launder_loop = 1; + maxlaunder = + (cnt.v_inactive_target > max_page_launder) ? + max_page_launder : cnt.v_inactive_target; + goto rescan0; + } + + /* + * Compute the page shortage from the point of view of having to + * move pages from the active queue to the inactive queue. */ + page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); page_shortage += addl_page_shortage; - if (page_shortage <= 0) { - page_shortage = 0; - } + + /* + * Scan the active queue for things we can deactivate + */ pcount = cnt.v_active_count; m = TAILQ_FIRST(&vm_page_queue_active); + while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { /* @@ -943,10 +1069,14 @@ rescan0: } s = splvm(); + /* * We try to maintain some *really* free pages, this allows interrupt - * code to be guaranteed space. + * code to be guaranteed space. Since both cache and free queues + * are considered basically 'free', moving pages from cache to free + * does not effect other calculations. */ + while (cnt.v_free_count < cnt.v_free_reserved) { static int cache_rover = 0; m = vm_page_list_find(PQ_CACHE, cache_rover); @@ -995,7 +1125,6 @@ rescan0: #endif } - /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. @@ -1242,10 +1371,8 @@ vm_pageout() cnt.v_pdwakeups++; vm_pages_needed = 0; splx(s); - vm_pager_sync(); vm_pageout_scan(); vm_pageout_deficit = 0; - vm_pager_sync(); wakeup(&cnt.v_free_count); } } diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index a864896..68c0561 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_pageout.h,v 1.22 1998/01/12 01:44:46 dyson Exp $ + * $Id: vm_pageout.h,v 1.23 1998/01/22 17:30:43 dyson Exp $ */ #ifndef _VM_VM_PAGEOUT_H_ @@ -100,7 +100,9 @@ extern int vm_pageout_deficit; extern void pagedaemon_wakeup __P((void)); #define VM_WAIT vm_wait() +#define VM_AWAIT vm_await() extern void vm_wait __P((void)); +extern void vm_await __P((void)); #ifdef KERNEL void vm_pageout_page __P((vm_page_t, vm_object_t)); diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 18df05d..62fe6e8 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -61,7 +61,7 @@ * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pager.c,v 1.39 1998/10/31 15:31:29 peter Exp $ + * $Id: vm_pager.c,v 1.40 1998/11/10 09:16:27 peter Exp $ */ /* @@ -91,6 +91,8 @@ extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; +int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ + static int dead_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static vm_object_t dead_pager_alloc __P((void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); @@ -164,14 +166,15 @@ struct pagerops deadpagerops = { NULL }; -static struct pagerops *pagertab[] = { +struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &deadpagerops /* OBJT_DEAD */ }; -static int npagers = sizeof(pagertab) / sizeof(pagertab[0]); + +int npagers = sizeof(pagertab) / sizeof(pagertab[0]); /* * Kernel address space for mapping pages. @@ -217,6 +220,8 @@ vm_pager_bufferinit() bp->b_xflags = 0; } + cluster_pbuf_freecnt = nswbuf / 2; + swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS); if (!swapbkva) panic("Not enough pager_map VM space for physical buffers"); @@ -246,41 +251,21 @@ vm_pager_deallocate(object) (*pagertab[object->type]->pgo_dealloc) (object); } +/* + * vm_pager_get_pages() - inline, see vm/vm_pager.h + * vm_pager_put_pages() - inline, see vm/vm_pager.h + * vm_pager_has_page() - inline, see vm/vm_pager.h + * vm_pager_page_inserted() - inline, see vm/vm_pager.h + * vm_pager_page_removed() - inline, see vm/vm_pager.h + */ -int -vm_pager_get_pages(object, m, count, reqpage) - vm_object_t object; - vm_page_t *m; - int count; - int reqpage; -{ - return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage)); -} - -int -vm_pager_put_pages(object, m, count, flags, rtvals) - vm_object_t object; - vm_page_t *m; - int count; - int flags; - int *rtvals; -{ - return ((*pagertab[object->type]->pgo_putpages)(object, m, count, flags, rtvals)); -} - -boolean_t -vm_pager_has_page(object, offset, before, after) - vm_object_t object; - vm_pindex_t offset; - int *before; - int *after; -{ - return ((*pagertab[object->type]->pgo_haspage) (object, offset, before, after)); -} - +#if 0 /* - * Called by pageout daemon before going back to sleep. - * Gives pagers a chance to clean up any completed async pageing operations. + * vm_pager_sync: + * + * Called by pageout daemon before going back to sleep. + * Gives pagers a chance to clean up any completed async pageing + * operations. */ void vm_pager_sync() @@ -292,6 +277,8 @@ vm_pager_sync() (*(*pgops)->pgo_sync) (); } +#endif + vm_offset_t vm_pager_map_page(m) vm_page_t m; @@ -342,20 +329,42 @@ initpbuf(struct buf *bp) { /* * allocate a physical buffer + * + * There are a limited number (nswbuf) of physical buffers. 
We need + * to make sure that no single subsystem is able to hog all of them, + * so each subsystem implements a counter which is typically initialized + * to 1/2 nswbuf. getpbuf() decrements this counter on allocation and + * increments it on release, and blocks if the counter hits zero. A + * subsystem may initialize the counter to -1 to disable the feature, + * but it must still be sure to match up all uses of getpbuf() with + * relpbuf() using the same variable. + * + * NOTE: pfreecnt can be NULL, but this 'feature' will be removed + * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * -getpbuf() +getpbuf(pfreecnt) + int *pfreecnt; { int s; struct buf *bp; s = splvm(); + + if (pfreecnt) { + while (*pfreecnt == 0) { + tsleep(pfreecnt, PVM, "wswbuf0", 0); + } + } + /* get a bp from the swap buffer header pool */ while ((bp = TAILQ_FIRST(&bswlist)) == NULL) { bswneeded = 1; - tsleep(&bswneeded, PVM, "wswbuf", 0); + tsleep(&bswneeded, PVM, "wswbuf1", 0); } TAILQ_REMOVE(&bswlist, bp, b_freelist); + if (pfreecnt) + --*pfreecnt; splx(s); initpbuf(bp); @@ -363,20 +372,27 @@ } /* - * allocate a physical buffer, if one is available + * allocate a physical buffer, if one is available. + * + * Note that there is no NULL hack here - all subsystems using this + * call understand how to use pfreecnt. */ struct buf * -trypbuf() +trypbuf(pfreecnt) + int *pfreecnt; { int s; struct buf *bp; s = splvm(); - if ((bp = TAILQ_FIRST(&bswlist)) == NULL) { + if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { splx(s); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); + + --*pfreecnt; + splx(s); initpbuf(bp); @@ -386,10 +402,14 @@ } /* * release a physical buffer + * + * NOTE: pfreecnt can be NULL, but this 'feature' will be removed + * relatively soon when the rest of the subsystems get smart about it. XXX */ void -relpbuf(bp) +relpbuf(bp, pfreecnt) struct buf *bp; + int *pfreecnt; { int s; @@ -403,6 +423,7 @@ relpbuf(bp) crfree(bp->b_wcred); bp->b_wcred = NOCRED; } + if (bp->b_vp) pbrelvp(bp); @@ -415,5 +436,9 @@ bswneeded = 0; wakeup(&bswneeded); } + if (pfreecnt) { + if (++*pfreecnt == 1) + wakeup(pfreecnt); + } splx(s); }
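With these changes getpbuf()/relpbuf() behave as a counting semaphore per subsystem. The expected consumer pattern looks like the sketch below; my_pbuf_freecnt and my_pbuf_io are hypothetical names, not part of this patch, and the initialization mirrors what vm_pager_bufferinit() does for cluster_pbuf_freecnt:

static int my_pbuf_freecnt = -1;	/* -1 = limit not yet initialized */

static void
my_pbuf_io(void)
{
	struct buf *bp;

	if (my_pbuf_freecnt < 0)
		my_pbuf_freecnt = nswbuf / 2;	/* claim at most half the pbufs */

	bp = getpbuf(&my_pbuf_freecnt);	/* may sleep in "wswbuf0"/"wswbuf1" */
	/* ... fill in the buffer header and perform the I/O ... */
	relpbuf(bp, &my_pbuf_freecnt);	/* 0->1 transition wakes a waiter */
}

Every getpbuf(), trypbuf() and relpbuf() issued by a given subsystem must pass the same counter, otherwise the accounting, and the wakeups keyed on it, break.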
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h index 6b8eb42..0e8d894 --- a/sys/vm/vm_pager.h +++ b/sys/vm/vm_pager.h @@ -36,7 +36,7 @@ * SUCH DAMAGE. * * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 - * $Id: vm_pager.h,v 1.16 1998/03/07 21:37:27 dyson Exp $ + * $Id: vm_pager.h,v 1.17 1998/10/13 08:24:44 dg Exp $ */ /* @@ -57,7 +57,7 @@ struct pagerops { int (*pgo_getpages) __P((vm_object_t, vm_page_t *, int, int)); /* Get (read) page. */ int (*pgo_putpages) __P((vm_object_t, vm_page_t *, int, int, int *)); /* Put (write) page. */ boolean_t (*pgo_haspage) __P((vm_object_t, vm_pindex_t, int *, int *)); /* Does pager have page? */ - void (*pgo_sync) __P((void)); + void (*pgo_pageunswapped) __P((vm_page_t)); }; /* @@ -87,20 +87,69 @@ MALLOC_DECLARE(M_VMPGDATA); extern vm_map_t pager_map; extern int pager_map_size; +extern struct pagerops *pagertab[]; vm_object_t vm_pager_allocate __P((objtype_t, void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); void vm_pager_bufferinit __P((void)); void vm_pager_deallocate __P((vm_object_t)); -int vm_pager_get_pages __P((vm_object_t, vm_page_t *, int, int)); -boolean_t vm_pager_has_page __P((vm_object_t, vm_pindex_t, int *, int *)); +static __inline int vm_pager_get_pages __P((vm_object_t, vm_page_t *, int, int)); +static __inline boolean_t vm_pager_has_page __P((vm_object_t, vm_pindex_t, int *, int *)); void vm_pager_init __P((void)); vm_object_t vm_pager_object_lookup __P((struct pagerlst *, void *)); vm_offset_t vm_pager_map_pages __P((vm_page_t *, int, boolean_t)); vm_offset_t vm_pager_map_page __P((vm_page_t)); -int vm_pager_put_pages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); +static __inline int vm_pager_put_pages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); void vm_pager_sync __P((void)); void vm_pager_unmap_pages __P((vm_offset_t, int)); void vm_pager_unmap_page __P((vm_offset_t)); + +static __inline int +vm_pager_get_pages( + vm_object_t object, + vm_page_t *m, + int count, + int reqpage +) { + return ((*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage)); +} + +static __inline int +vm_pager_put_pages( + vm_object_t object, + vm_page_t *m, + int count, + int flags, + int *rtvals +) { + return ((*pagertab[object->type]->pgo_putpages)(object, m, count, flags, rtvals)); +} + +static __inline boolean_t +vm_pager_has_page( + vm_object_t object, + vm_pindex_t offset, + int *before, + int *after +) { + return ((*pagertab[object->type]->pgo_haspage) (object, offset, before, after)); } + +/* + * vm_pager_page_unswapped + * + * called at splvm() to destroy swap associated with the page. + * + * This function may not block. + */ + +static __inline void +vm_pager_page_unswapped(vm_page_t m) +{ + if (pagertab[m->object->type]->pgo_pageunswapped) + (*pagertab[m->object->type]->pgo_pageunswapped)(m); +} + + #endif #endif /* _VM_PAGER_ */
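With the entry points inlined, a pager call is just an indirect call through pagertab[]. On return from vm_pager_put_pages() the caller examines one status code per page; the fragment below is illustrative only (clean_pages_sketch is a hypothetical name, the flags argument is left 0, and count is assumed to fit the local array):

static void
clean_pages_sketch(vm_object_t object, vm_page_t *m, int count)
{
	int i;
	int rtvals[8];	/* one VM_PAGER_* code per page; count <= 8 assumed */

	vm_pager_put_pages(object, m, count, 0, rtvals);
	for (i = 0; i < count; i++) {
		if (rtvals[i] != VM_PAGER_OK) {
			/* page i was not written; it remains dirty */
		}
	}
}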
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c index bfcebdc..f973631 --- a/sys/vm/vm_swap.c +++ b/sys/vm/vm_swap.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 - * $Id: vm_swap.c,v 1.56 1998/07/04 22:30:26 julian Exp $ + * $Id: vm_swap.c,v 1.57 1998/10/25 19:24:04 bde Exp $ */ #include "opt_devfs.h" @@ -50,7 +50,7 @@ #include <sys/dmap.h> /* XXX */ #include <sys/vnode.h> #include <sys/fcntl.h> -#include <sys/rlist.h> +#include <sys/blist.h> #include <sys/kernel.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -94,8 +94,7 @@ static dev_t swapdev = makedev(BDEV_MAJOR, 0); static struct swdevt should_be_malloced[NSWAPDEV]; static struct swdevt *swdevt = should_be_malloced; struct vnode *swapdev_vp; -/* XXX swapinfo(8) needs this one I belive */ -int nswap; /* first block after the interleaved devs */ +static int nswap; /* first block after the interleaved devs */ static int nswdev = NSWAPDEV; int vm_swap_size; @@ -119,7 +118,13 @@ swstrategy(bp) register struct swdevt *sp; struct vnode *vp; - sz = howmany(bp->b_bcount, DEV_BSIZE); + sz = howmany(bp->b_bcount, PAGE_SIZE); + /* + * Convert interleaved swap into per-device swap. Note that + * the block size is left in PAGE_SIZE'd chunks (for the newswap) + * here. + */ + if (nswdev > 1) { off = bp->b_blkno % dmmax; if (off + sz > dmmax) { @@ -132,8 +137,9 @@ index = seg % nswdev; seg /= nswdev; bp->b_blkno = seg * dmmax + off; - } else + } else { index = 0; + } sp = &swdevt[index]; if (bp->b_blkno + sz > sp->sw_nblks) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; biodone(bp); return; } + + /* + * Convert from PAGE_SIZE'd to DEV_BSIZE'd chunks for the actual I/O + */ + bp->b_blkno = ctodb(bp->b_blkno); + vhold(sp->sw_vp); s = splvm(); if ((bp->b_flags & B_READ) == 0) { @@ -161,10 +173,8 @@ } sp->sw_vp->v_numoutput++; } - if (bp->b_vp != NULL) - pbrelvp(bp); + pbreassignbuf(bp, sp->sw_vp); splx(s); - bp->b_vp = sp->sw_vp; VOP_STRATEGY(bp->b_vp, bp); } @@ -240,6 +250,11 @@ swapon(p, uap) * Each of the nswdev devices provides 1/nswdev'th of the swap * space, which is laid out with blocks of dmmax pages circularly * among the devices. + * + * The new swap code uses page-sized blocks. The old swap code used + * DEV_BSIZE'd chunks. + * + * XXX locking when multiple swapons run in parallel */ int swaponvp(p, vp, dev, nblks) @@ -277,18 +292,37 @@ (void) VOP_CLOSE(vp, FREAD | FWRITE, p->p_ucred, p); return (ENXIO); } + /* + * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. + * First chop nblks off to page-align it, then convert. + * + * sw->sw_nblks is in page-sized chunks now too. + */ + nblks &= ~(ctodb(1) - 1); + nblks = dbtoc(nblks); + sp->sw_vp = vp; sp->sw_dev = dev; sp->sw_flags |= SW_FREED; sp->sw_nblks = nblks; + /* + * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not + * DEV_BSIZE'd. + */ + if (nblks * nswdev > nswap) nswap = (nblks+1) * nswdev; + if (swapblist == NULL) + swapblist = blist_create(nswap); + else + blist_resize(&swapblist, nswap, 0); + for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) { blk = min(nblks - dvbase, dmmax); vsbase = index * dmmax + dvbase * nswdev; - rlist_free(&swaplist, vsbase, vsbase + blk - 1); + blist_free(swapblist, vsbase, blk); vm_swap_size += blk; }
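The conversions above keep swap block numbers in PAGE_SIZE'd units everywhere inside the VM system and switch to DEV_BSIZE'd units only at the driver boundary. Assuming PAGE_SIZE is 4096 and DEV_BSIZE is 512 (so ctodb(1) == 8), the arithmetic works out as follows; the numbers are illustrative only:

	nblks = 20001;			/* device size in DEV_BSIZE blocks */
	nblks &= ~(ctodb(1) - 1);	/* 20001 & ~7 == 20000: page-aligned */
	nblks = dbtoc(nblks);		/* 20000 / 8 == 2500 PAGE_SIZE'd blocks */

	/* at I/O time, swstrategy() converts back for VOP_STRATEGY(): */
	bp->b_blkno = ctodb(bp->b_blkno);	/* page block 7 -> device block 56 */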
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index fba7e2f..fe04da4 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -38,7 +38,7 @@ * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.100 1998/10/13 08:24:44 dg Exp $ + * $Id: vnode_pager.c,v 1.101 1998/12/04 18:39:44 rvb Exp $ */ /* @@ -88,6 +88,8 @@ struct pagerops vnodepagerops = { NULL }; +int vnode_pbuf_freecnt = -1; /* start out unlimited */ + /* * Allocate (or lookup) pager for a vnode. @@ -106,6 +108,13 @@ vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, if (handle == NULL) return (NULL); + /* + * XXX hack - This initialization should be put somewhere else. + */ + if (vnode_pbuf_freecnt < 0) { + vnode_pbuf_freecnt = nswbuf / 2 + 1; + } + vp = (struct vnode *) handle; /* @@ -395,7 +404,7 @@ vnode_pager_input_smlfs(object, m) fileaddr = vnode_pager_addr(vp, IDX_TO_OFF(m->pindex) + i * bsize, (int *)0); if (fileaddr != -1) { - bp = getpbuf(); + bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; @@ -428,7 +437,7 @@ /* * free the buffer header back to the swap buffer pool */ - relpbuf(bp); + relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; @@ -707,7 +716,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - bp = getpbuf(); + bp = getpbuf(&vnode_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; /* @@ -755,7 +764,7 @@ /* * free the buffer header back to the swap buffer pool */ - relpbuf(bp); + relpbuf(bp, &vnode_pbuf_freecnt); for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) { vm_page_t mt;