author | rgrimes <rgrimes@FreeBSD.org> | 1994-05-25 09:21:21 +0000
---|---|---
committer | rgrimes <rgrimes@FreeBSD.org> | 1994-05-25 09:21:21 +0000
commit | 2469c867a164210ce96143517059f21db7f1fd17 (patch) |
tree | 9179427ac860211c445df663fd2b86267366bfba /sys/vm |
parent | cb0aba89af15a48e2655e898a503946ac4cb42ae (diff) |
download | FreeBSD-src-2469c867a164210ce96143517059f21db7f1fd17.zip, FreeBSD-src-2469c867a164210ce96143517059f21db7f1fd17.tar.gz |
The big 4.4BSD Lite to FreeBSD 2.0.0 (Development) patch.
Reviewed by: Rodney W. Grimes
Submitted by: John Dyson and David Greenman
Diffstat (limited to 'sys/vm')
-rw-r--r-- | sys/vm/device_pager.c | 72
-rw-r--r-- | sys/vm/swap_pager.c | 2107
-rw-r--r-- | sys/vm/swap_pager.h | 65
-rw-r--r-- | sys/vm/vm.h | 4
-rw-r--r-- | sys/vm/vm_extern.h | 12
-rw-r--r-- | sys/vm/vm_fault.c | 440
-rw-r--r-- | sys/vm/vm_glue.c | 500
-rw-r--r-- | sys/vm/vm_init.c | 8
-rw-r--r-- | sys/vm/vm_kern.c | 6
-rw-r--r-- | sys/vm/vm_kern.h | 4
-rw-r--r-- | sys/vm/vm_map.c | 127
-rw-r--r-- | sys/vm/vm_map.h | 10
-rw-r--r-- | sys/vm/vm_meter.c | 1
-rw-r--r-- | sys/vm/vm_mmap.c | 4
-rw-r--r-- | sys/vm/vm_object.c | 495
-rw-r--r-- | sys/vm/vm_page.c | 391
-rw-r--r-- | sys/vm/vm_page.h | 30
-rw-r--r-- | sys/vm/vm_pageout.c | 1063
-rw-r--r-- | sys/vm/vm_pageout.h | 30
-rw-r--r-- | sys/vm/vm_pager.c | 173
-rw-r--r-- | sys/vm/vm_pager.h | 37
-rw-r--r-- | sys/vm/vm_param.h | 30
-rw-r--r-- | sys/vm/vm_prot.h | 2
-rw-r--r-- | sys/vm/vm_swap.c | 16
-rw-r--r-- | sys/vm/vm_unix.c | 41
-rw-r--r-- | sys/vm/vm_user.c | 4
-rw-r--r-- | sys/vm/vnode_pager.c | 1334
-rw-r--r-- | sys/vm/vnode_pager.h | 3
28 files changed, 4909 insertions, 2100 deletions
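At the heart of the rewritten swap pager in the diff below is a fixed-size block map: each sw_blk_t covers SWB_NPAGES pages, swb_block[] holds one disk address per page (SWB_EMPTY when unallocated), and swb_valid is a bitmask of the pages actually written. A minimal standalone sketch of the offset arithmetic behind swap_pager_block_index(), swap_pager_block_offset(), and swap_pager_diskaddr() follows; the PAGE_SIZE and SWB_NPAGES values and the swb_index/swb_offset helper names are assumptions for illustration, since swap_pager.h is truncated in this view.

```c
/*
 * Illustrative sketch only, not part of the patch: the offset-to-block
 * arithmetic used by swap_pager_block_index()/swap_pager_block_offset().
 * PAGE_SIZE = 4096 and SWB_NPAGES = 8 are assumed values.
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define SWB_NPAGES	8
#define SWB_EMPTY	(-1)

struct swblock {
	unsigned swb_valid;		/* bitmask: which pages were written */
	int	 swb_block[SWB_NPAGES];	/* disk address per page, or SWB_EMPTY */
};

/* which swblock covers a byte offset into the object */
static int
swb_index(unsigned long offset)
{
	return (offset / (SWB_NPAGES * PAGE_SIZE));
}

/* which page slot within that swblock */
static int
swb_offset(unsigned long offset)
{
	return ((offset % (SWB_NPAGES * PAGE_SIZE)) / PAGE_SIZE);
}

int
main(void)
{
	unsigned long off = 13 * PAGE_SIZE + 100;

	/* 13 pages in: block 1, slot 5 */
	printf("block %d, slot %d\n", swb_index(off), swb_offset(off));
	return (0);
}
```

With that layout, swap_pager_haspage() in the patch reduces to checking that swb_block[slot] is not SWB_EMPTY and that the corresponding swb_valid bit is set.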
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index 235c917a..b8083df 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -35,7 +35,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)device_pager.c 8.5 (Berkeley) 1/12/94 + * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 */ /* @@ -53,8 +53,8 @@ #include <vm/vm_page.h> #include <vm/device_pager.h> -struct pagerlst dev_pager_list; /* list of managed devices */ -struct pglist dev_pager_fakelist; /* list of available vm_page_t's */ +struct pagerlst dev_pager_list; /* list of managed devices */ +struct pglist dev_pager_fakelist; /* list of available vm_page_t's */ #ifdef DEBUG int dpagerdebug = 0; @@ -68,11 +68,11 @@ static vm_pager_t dev_pager_alloc __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); static void dev_pager_dealloc __P((vm_pager_t)); static int dev_pager_getpage - __P((vm_pager_t, vm_page_t *, int, boolean_t)); + __P((vm_pager_t, vm_page_t, boolean_t)); static boolean_t dev_pager_haspage __P((vm_pager_t, vm_offset_t)); static void dev_pager_init __P((void)); static int dev_pager_putpage - __P((vm_pager_t, vm_page_t *, int, boolean_t)); + __P((vm_pager_t, vm_page_t, boolean_t)); static vm_page_t dev_pager_getfake __P((vm_offset_t)); static void dev_pager_putfake __P((vm_page_t)); @@ -81,9 +81,10 @@ struct pagerops devicepagerops = { dev_pager_alloc, dev_pager_dealloc, dev_pager_getpage, + 0, dev_pager_putpage, - dev_pager_haspage, - vm_pager_clusternull + 0, + dev_pager_haspage }; static void @@ -109,7 +110,7 @@ dev_pager_alloc(handle, size, prot, foff) int (*mapfunc)(); vm_object_t object; dev_pager_t devp; - int npages, off; + unsigned int npages, off; #ifdef DEBUG if (dpagerdebug & DDB_FOLLOW) @@ -127,7 +128,7 @@ dev_pager_alloc(handle, size, prot, foff) /* * Make sure this device can be mapped. */ - dev = (dev_t)handle; + dev = (dev_t)(u_long)handle; mapfunc = cdevsw[major(dev)].d_mmap; if (mapfunc == NULL || mapfunc == enodev || mapfunc == nullop) return(NULL); @@ -135,7 +136,7 @@ dev_pager_alloc(handle, size, prot, foff) /* * Offset should be page aligned. */ - if (foff & PAGE_MASK) + if (foff & (PAGE_SIZE-1)) return(NULL); /* @@ -169,15 +170,15 @@ top: pager->pg_handle = handle; pager->pg_ops = &devicepagerops; pager->pg_type = PG_DEVICE; + pager->pg_data = (caddr_t)devp; pager->pg_flags = 0; - pager->pg_data = devp; TAILQ_INIT(&devp->devp_pglist); /* * Allocate object and associate it with the pager. */ object = devp->devp_object = vm_object_allocate(0); vm_object_enter(object, pager); - vm_object_setpager(object, pager, (vm_offset_t)0, FALSE); + vm_object_setpager(object, pager, (vm_offset_t)foff, FALSE); /* * Finally, put it on the managed list so other can find it. * First we re-lookup in case someone else beat us to this @@ -239,7 +240,7 @@ dev_pager_dealloc(pager) /* * Free up our fake pages. 
*/ - while ((m = devp->devp_pglist.tqh_first) != NULL) { + while (m=devp->devp_pglist.tqh_first) { TAILQ_REMOVE(&devp->devp_pglist, m, pageq); dev_pager_putfake(m); } @@ -248,39 +249,33 @@ dev_pager_dealloc(pager) } static int -dev_pager_getpage(pager, mlist, npages, sync) +dev_pager_getpage(pager, m, sync) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t m; boolean_t sync; { register vm_object_t object; vm_offset_t offset, paddr; vm_page_t page; dev_t dev; + int s; int (*mapfunc)(), prot; - vm_page_t m; #ifdef DEBUG if (dpagerdebug & DDB_FOLLOW) - printf("dev_pager_getpage(%x, %x, %x, %x)\n", - pager, mlist, npages, sync); + printf("dev_pager_getpage(%x, %x)\n", pager, m); #endif - if (npages != 1) - panic("dev_pager_getpage: cannot handle multiple pages"); - m = *mlist; - object = m->object; - dev = (dev_t)pager->pg_handle; + dev = (dev_t)(u_long)pager->pg_handle; offset = m->offset + object->paging_offset; prot = PROT_READ; /* XXX should pass in? */ mapfunc = cdevsw[major(dev)].d_mmap; -#ifdef DIAGNOSTIC + if (mapfunc == NULL || mapfunc == enodev || mapfunc == nullop) panic("dev_pager_getpage: no map function"); -#endif - paddr = pmap_phys_address((*mapfunc)(dev, (int)offset, prot)); + + paddr = pmap_phys_address((*mapfunc)((dev_t)dev, (int)offset, prot)); #ifdef DIAGNOSTIC if (paddr == -1) panic("dev_pager_getpage: map function returns error"); @@ -290,13 +285,15 @@ dev_pager_getpage(pager, mlist, npages, sync) * up the original. */ page = dev_pager_getfake(paddr); - TAILQ_INSERT_TAIL(&((dev_pager_t)pager->pg_data)->devp_pglist, page, - pageq); + TAILQ_INSERT_TAIL(&((dev_pager_t)pager->pg_data)->devp_pglist, + page, pageq); vm_object_lock(object); vm_page_lock_queues(); vm_page_free(m); - vm_page_insert(page, object, offset); vm_page_unlock_queues(); + s = splhigh(); + vm_page_insert(page, object, offset); + splx(s); PAGE_WAKEUP(m); if (offset + PAGE_SIZE > object->size) object->size = offset + PAGE_SIZE; /* XXX anal */ @@ -306,19 +303,17 @@ dev_pager_getpage(pager, mlist, npages, sync) } static int -dev_pager_putpage(pager, mlist, npages, sync) +dev_pager_putpage(pager, m, sync) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t m; boolean_t sync; { #ifdef DEBUG if (dpagerdebug & DDB_FOLLOW) - printf("dev_pager_putpage(%x, %x, %x, %x)\n", - pager, mlist, npages, sync); + printf("dev_pager_putpage(%x, %x)\n", pager, m); #endif if (pager == NULL) - return; + return 0; panic("dev_pager_putpage called"); } @@ -350,9 +345,12 @@ dev_pager_getfake(paddr) } m = dev_pager_fakelist.tqh_first; TAILQ_REMOVE(&dev_pager_fakelist, m, pageq); + m->flags = PG_BUSY | PG_CLEAN | PG_FAKE | PG_FICTITIOUS; - m->phys_addr = paddr; + m->wire_count = 1; + m->phys_addr = paddr; + return(m); } diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 899a6cf..5a1efae 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
@@ -51,179 +52,145 @@ #include <sys/systm.h> #include <sys/proc.h> #include <sys/buf.h> -#include <sys/map.h> #include <sys/vnode.h> #include <sys/malloc.h> #include <miscfs/specfs/specdev.h> +#include <sys/rlist.h> #include <vm/vm.h> +#include <vm/vm_pager.h> #include <vm/vm_page.h> #include <vm/vm_pageout.h> #include <vm/swap_pager.h> -#define NSWSIZES 16 /* size of swtab */ -#define MAXDADDRS 64 /* max # of disk addrs for fixed allocations */ #ifndef NPENDINGIO -#define NPENDINGIO 64 /* max # of pending cleans */ +#define NPENDINGIO 16 #endif -#ifdef DEBUG -int swpagerdebug = 0x100; -#define SDB_FOLLOW 0x001 -#define SDB_INIT 0x002 -#define SDB_ALLOC 0x004 -#define SDB_IO 0x008 -#define SDB_WRITE 0x010 -#define SDB_FAIL 0x020 -#define SDB_ALLOCBLK 0x040 -#define SDB_FULL 0x080 -#define SDB_ANOM 0x100 -#define SDB_ANOMPANIC 0x200 -#define SDB_CLUSTER 0x400 -#define SDB_PARANOIA 0x800 -#endif +extern int nswbuf; +int nswiodone; +extern int vm_pageout_rate_limit; +static int cleandone; +extern int hz; +int swap_pager_full; +extern vm_map_t pager_map; +extern int vm_pageout_pages_needed; +extern int vm_swap_size; +extern struct vnode *swapdev_vp; + +#define MAX_PAGEOUT_CLUSTER 8 TAILQ_HEAD(swpclean, swpagerclean); +typedef struct swpagerclean *swp_clean_t; + struct swpagerclean { TAILQ_ENTRY(swpagerclean) spc_list; int spc_flags; struct buf *spc_bp; sw_pager_t spc_swp; vm_offset_t spc_kva; - vm_page_t spc_m; - int spc_npages; -} swcleanlist[NPENDINGIO]; -typedef struct swpagerclean *swp_clean_t; - -/* spc_flags values */ -#define SPC_FREE 0x00 -#define SPC_BUSY 0x01 -#define SPC_DONE 0x02 -#define SPC_ERROR 0x04 - -struct swtab { - vm_size_t st_osize; /* size of object (bytes) */ - int st_bsize; /* vs. size of swap block (DEV_BSIZE units) */ -#ifdef DEBUG - u_long st_inuse; /* number in this range in use */ - u_long st_usecnt; /* total used of this size */ -#endif -} swtab[NSWSIZES+1]; + vm_offset_t spc_altkva; + int spc_count; + vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; +} swcleanlist [NPENDINGIO] ; -#ifdef DEBUG -int swap_pager_poip; /* pageouts in progress */ -int swap_pager_piip; /* pageins in progress */ -#endif -int swap_pager_maxcluster; /* maximum cluster size */ -int swap_pager_npendingio; /* number of pager clean structs */ +extern vm_map_t kernel_map; -struct swpclean swap_pager_inuse; /* list of pending page cleans */ -struct swpclean swap_pager_free; /* list of free pager clean structs */ -struct pagerlst swap_pager_list; /* list of "named" anon regions */ +/* spc_flags values */ +#define SPC_ERROR 0x01 + +#define SWB_EMPTY (-1) + +void swap_pager_init(void); +vm_pager_t swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t); +void swap_pager_dealloc(vm_pager_t); +boolean_t swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t); +boolean_t swap_pager_haspage(vm_pager_t, vm_offset_t); +int swap_pager_io(sw_pager_t, vm_page_t *, int, int, int); +void swap_pager_iodone(struct buf *); +boolean_t swap_pager_clean(); + +extern struct pagerops swappagerops; + +struct swpclean swap_pager_done; /* list of compileted page cleans */ +struct swpclean swap_pager_inuse; /* list of pending page cleans */ +struct swpclean swap_pager_free; /* list of free pager clean structs */ +struct pagerlst swap_pager_list; /* list of "named" anon regions */ +struct pagerlst swap_pager_un_list; /* list of "unnamed" anon pagers */ + +#define SWAP_FREE_NEEDED 0x1 /* 
need a swap block */ +int swap_pager_needflags; +struct rlist *swapfrag; + +struct pagerlst *swp_qs[]={ + &swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0 +}; -static void swap_pager_init __P((void)); -static vm_pager_t swap_pager_alloc - __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); -static void swap_pager_clean __P((int)); -#ifdef DEBUG -static void swap_pager_clean_check __P((vm_page_t *, int, int)); -#endif -static void swap_pager_cluster - __P((vm_pager_t, vm_offset_t, - vm_offset_t *, vm_offset_t *)); -static void swap_pager_dealloc __P((vm_pager_t)); -static int swap_pager_getpage - __P((vm_pager_t, vm_page_t *, int, boolean_t)); -static boolean_t swap_pager_haspage __P((vm_pager_t, vm_offset_t)); -static int swap_pager_io __P((sw_pager_t, vm_page_t *, int, int)); -static void swap_pager_iodone __P((struct buf *)); -static int swap_pager_putpage - __P((vm_pager_t, vm_page_t *, int, boolean_t)); +int swap_pager_putmulti(); struct pagerops swappagerops = { swap_pager_init, swap_pager_alloc, swap_pager_dealloc, swap_pager_getpage, + swap_pager_getmulti, swap_pager_putpage, - swap_pager_haspage, - swap_pager_cluster + swap_pager_putmulti, + swap_pager_haspage }; -static void +extern int nswbuf; + +int npendingio = NPENDINGIO; +int pendingiowait; +int require_swap_init; +void swap_pager_finish(); +int dmmin, dmmax; +extern int vm_page_count; + +struct buf * getpbuf() ; +void relpbuf(struct buf *bp) ; + +static inline void swapsizecheck() { + if( vm_swap_size < 128*btodb(PAGE_SIZE)) { + if( swap_pager_full) + printf("swap_pager: out of space\n"); + swap_pager_full = 1; + } else if( vm_swap_size > 192*btodb(PAGE_SIZE)) + swap_pager_full = 0; +} + +void swap_pager_init() { - register swp_clean_t spc; - register int i, bsize; extern int dmmin, dmmax; - int maxbsize; -#ifdef DEBUG - if (swpagerdebug & (SDB_FOLLOW|SDB_INIT)) - printf("swpg_init()\n"); -#endif dfltpagerops = &swappagerops; - TAILQ_INIT(&swap_pager_list); - /* - * Allocate async IO structures. - * - * XXX it would be nice if we could do this dynamically based on - * the value of nswbuf (since we are ultimately limited by that) - * but neither nswbuf or malloc has been initialized yet. So the - * structs are statically allocated above. - */ - swap_pager_npendingio = NPENDINGIO; + TAILQ_INIT(&swap_pager_list); + TAILQ_INIT(&swap_pager_un_list); /* * Initialize clean lists */ TAILQ_INIT(&swap_pager_inuse); + TAILQ_INIT(&swap_pager_done); TAILQ_INIT(&swap_pager_free); - for (i = 0, spc = swcleanlist; i < swap_pager_npendingio; i++, spc++) { - TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); - spc->spc_flags = SPC_FREE; - } + + require_swap_init = 1; /* * Calculate the swap allocation constants. */ - if (dmmin == 0) { - dmmin = DMMIN; - if (dmmin < CLBYTES/DEV_BSIZE) - dmmin = CLBYTES/DEV_BSIZE; - } - if (dmmax == 0) - dmmax = DMMAX; - - /* - * Fill in our table of object size vs. 
allocation size - */ - bsize = btodb(PAGE_SIZE); - if (bsize < dmmin) - bsize = dmmin; - maxbsize = btodb(sizeof(sw_bm_t) * NBBY * PAGE_SIZE); - if (maxbsize > dmmax) - maxbsize = dmmax; - for (i = 0; i < NSWSIZES; i++) { - swtab[i].st_osize = (vm_size_t) (MAXDADDRS * dbtob(bsize)); - swtab[i].st_bsize = bsize; - if (bsize <= btodb(MAXPHYS)) - swap_pager_maxcluster = dbtob(bsize); -#ifdef DEBUG - if (swpagerdebug & SDB_INIT) - printf("swpg_init: ix %d, size %x, bsize %x\n", - i, swtab[i].st_osize, swtab[i].st_bsize); -#endif - if (bsize >= maxbsize) - break; - bsize *= 2; - } - swtab[i].st_osize = 0; - swtab[i].st_bsize = bsize; + + dmmin = CLBYTES/DEV_BSIZE; + dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2; + } /* @@ -231,22 +198,43 @@ swap_pager_init() * Note that if we are called from the pageout daemon (handle == NULL) * we should not wait for memory as it could resulting in deadlock. */ -static vm_pager_t -swap_pager_alloc(handle, size, prot, foff) +vm_pager_t +swap_pager_alloc(handle, size, prot, offset) caddr_t handle; register vm_size_t size; vm_prot_t prot; - vm_offset_t foff; + vm_offset_t offset; { register vm_pager_t pager; register sw_pager_t swp; - struct swtab *swt; int waitok; - -#ifdef DEBUG - if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC)) - printf("swpg_alloc(%x, %x, %x)\n", handle, size, prot); -#endif + int i,j; + + if (require_swap_init) { + swp_clean_t spc; + struct buf *bp; + /* + * kva's are allocated here so that we dont need to keep + * doing kmem_alloc pageables at runtime + */ + for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) { + spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE); + if (!spc->spc_kva) { + break; + } + spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT); + if (!spc->spc_bp) { + kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); + break; + } + spc->spc_flags = 0; + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + } + require_swap_init = 0; + if( size == 0) + return(NULL); + } + /* * If this is a "named" anonymous region, look it up and * return the appropriate pager if it exists. @@ -264,50 +252,43 @@ swap_pager_alloc(handle, size, prot, foff) return(pager); } } + + if (swap_pager_full) { + return(NULL); + } + /* * Pager doesn't exist, allocate swap management resources * and initialize. */ - waitok = handle ? M_WAITOK : M_NOWAIT; + waitok = handle ? 
M_WAITOK : M_NOWAIT; pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok); if (pager == NULL) return(NULL); swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok); if (swp == NULL) { -#ifdef DEBUG - if (swpagerdebug & SDB_FAIL) - printf("swpg_alloc: swpager malloc failed\n"); -#endif free((caddr_t)pager, M_VMPAGER); return(NULL); } size = round_page(size); - for (swt = swtab; swt->st_osize; swt++) - if (size <= swt->st_osize) - break; -#ifdef DEBUG - swt->st_inuse++; - swt->st_usecnt++; -#endif swp->sw_osize = size; - swp->sw_bsize = swt->st_bsize; - swp->sw_nblocks = (btodb(size) + swp->sw_bsize - 1) / swp->sw_bsize; + swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE); swp->sw_blocks = (sw_blk_t) malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks), - M_VMPGDATA, M_NOWAIT); + M_VMPGDATA, waitok); if (swp->sw_blocks == NULL) { free((caddr_t)swp, M_VMPGDATA); free((caddr_t)pager, M_VMPAGER); -#ifdef DEBUG - if (swpagerdebug & SDB_FAIL) - printf("swpg_alloc: sw_blocks malloc failed\n"); - swt->st_inuse--; - swt->st_usecnt--; -#endif - return(FALSE); + return(NULL); + } + + for (i = 0; i < swp->sw_nblocks; i++) { + swp->sw_blocks[i].swb_valid = 0; + swp->sw_blocks[i].swb_locked = 0; + for (j = 0; j < SWB_NPAGES; j++) + swp->sw_blocks[i].swb_block[j] = SWB_EMPTY; } - bzero((caddr_t)swp->sw_blocks, - swp->sw_nblocks * sizeof(*swp->sw_blocks)); + swp->sw_poip = 0; if (handle) { vm_object_t object; @@ -324,686 +305,1530 @@ swap_pager_alloc(handle, size, prot, foff) vm_object_setpager(object, pager, 0, FALSE); } else { swp->sw_flags = 0; - pager->pg_list.tqe_next = NULL; - pager->pg_list.tqe_prev = NULL; + TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list); } pager->pg_handle = handle; pager->pg_ops = &swappagerops; pager->pg_type = PG_SWAP; - pager->pg_flags = PG_CLUSTERPUT; - pager->pg_data = swp; + pager->pg_data = (caddr_t)swp; -#ifdef DEBUG - if (swpagerdebug & SDB_ALLOC) - printf("swpg_alloc: pg_data %x, %x of %x at %x\n", - swp, swp->sw_nblocks, swp->sw_bsize, swp->sw_blocks); -#endif return(pager); } +/* + * returns disk block associated with pager and offset + * additionally, as a side effect returns a flag indicating + * if the block has been written + */ + +static int * +swap_pager_diskaddr(swp, offset, valid) + sw_pager_t swp; + vm_offset_t offset; + int *valid; +{ + register sw_blk_t swb; + int ix; + + if (valid) + *valid = 0; + ix = offset / (SWB_NPAGES*PAGE_SIZE); + if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { + return(FALSE); + } + swb = &swp->sw_blocks[ix]; + ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; + if (valid) + *valid = swb->swb_valid & (1<<ix); + return &swb->swb_block[ix]; +} + +/* + * Utility routine to set the valid (written) bit for + * a block associated with a pager and offset + */ static void +swap_pager_setvalid(swp, offset, valid) + sw_pager_t swp; + vm_offset_t offset; + int valid; +{ + register sw_blk_t swb; + int ix; + + ix = offset / (SWB_NPAGES*PAGE_SIZE); + if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) + return; + + swb = &swp->sw_blocks[ix]; + ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; + if (valid) + swb->swb_valid |= (1 << ix); + else + swb->swb_valid &= ~(1 << ix); + return; +} + +/* + * this routine allocates swap space with a fragmentation + * minimization policy. 
+ */ +int +swap_pager_getswapspace( unsigned amount, unsigned *rtval) { + unsigned tmpalloc; + unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE); + if( amount < nblocksfrag) { + if( rlist_alloc(&swapfrag, amount, rtval)) + return 1; + if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc)) + return 0; + rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1); + *rtval = tmpalloc; + return 1; + } + if( !rlist_alloc(&swapmap, amount, rtval)) + return 0; + else + return 1; +} + +/* + * this routine frees swap space with a fragmentation + * minimization policy. + */ +void +swap_pager_freeswapspace( unsigned from, unsigned to) { + unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE); + unsigned tmpalloc; + if( ((to + 1) - from) >= nblocksfrag) { + while( (from + nblocksfrag) <= to + 1) { + rlist_free(&swapmap, from, from + nblocksfrag - 1); + from += nblocksfrag; + } + } + if( from >= to) + return; + rlist_free(&swapfrag, from, to); + while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) { + rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1); + } +} +/* + * this routine frees swap blocks from a specified pager + */ +void +_swap_pager_freespace(swp, start, size) + sw_pager_t swp; + vm_offset_t start; + vm_offset_t size; +{ + vm_offset_t i; + int s; + + s = splbio(); + for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) { + int valid; + int *addr = swap_pager_diskaddr(swp, i, &valid); + if (addr && *addr != SWB_EMPTY) { + swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1); + if( valid) { + vm_swap_size += btodb(PAGE_SIZE); + swap_pager_setvalid(swp, i, 0); + } + *addr = SWB_EMPTY; + } + } + swapsizecheck(); + splx(s); +} + +void +swap_pager_freespace(pager, start, size) + vm_pager_t pager; + vm_offset_t start; + vm_offset_t size; +{ + _swap_pager_freespace((sw_pager_t) pager->pg_data, start, size); +} + +/* + * swap_pager_reclaim frees up over-allocated space from all pagers + * this eliminates internal fragmentation due to allocation of space + * for segments that are never swapped to. It has been written so that + * it does not block until the rlist_free operation occurs; it keeps + * the queues consistant. 
+ */ + +/* + * Maximum number of blocks (pages) to reclaim per pass + */ +#define MAXRECLAIM 256 + +void +swap_pager_reclaim() +{ + vm_pager_t p; + sw_pager_t swp; + int i, j, k; + int s; + int reclaimcount; + static int reclaims[MAXRECLAIM]; + static int in_reclaim; + +/* + * allow only one process to be in the swap_pager_reclaim subroutine + */ + s = splbio(); + if (in_reclaim) { + tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0); + splx(s); + return; + } + in_reclaim = 1; + reclaimcount = 0; + + /* for each pager queue */ + for (k = 0; swp_qs[k]; k++) { + + p = swp_qs[k]->tqh_first; + while (p && (reclaimcount < MAXRECLAIM)) { + + /* + * see if any blocks associated with a pager has been + * allocated but not used (written) + */ + swp = (sw_pager_t) p->pg_data; + for (i = 0; i < swp->sw_nblocks; i++) { + sw_blk_t swb = &swp->sw_blocks[i]; + if( swb->swb_locked) + continue; + for (j = 0; j < SWB_NPAGES; j++) { + if (swb->swb_block[j] != SWB_EMPTY && + (swb->swb_valid & (1 << j)) == 0) { + reclaims[reclaimcount++] = swb->swb_block[j]; + swb->swb_block[j] = SWB_EMPTY; + if (reclaimcount >= MAXRECLAIM) + goto rfinished; + } + } + } + p = p->pg_list.tqe_next; + } + } + +rfinished: + +/* + * free the blocks that have been added to the reclaim list + */ + for (i = 0; i < reclaimcount; i++) { + swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1); + swapsizecheck(); + wakeup((caddr_t) &in_reclaim); + } + + splx(s); + in_reclaim = 0; + wakeup((caddr_t) &in_reclaim); +} + + +/* + * swap_pager_copy copies blocks from one pager to another and + * destroys the source pager + */ + +void +swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset) + vm_pager_t srcpager; + vm_offset_t srcoffset; + vm_pager_t dstpager; + vm_offset_t dstoffset; + vm_offset_t offset; +{ + sw_pager_t srcswp, dstswp; + vm_offset_t i; + int s; + + srcswp = (sw_pager_t) srcpager->pg_data; + dstswp = (sw_pager_t) dstpager->pg_data; + +/* + * remove the source pager from the swap_pager internal queue + */ + s = splbio(); + if (srcswp->sw_flags & SW_NAMED) { + TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list); + srcswp->sw_flags &= ~SW_NAMED; + } else { + TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list); + } + + while (srcswp->sw_poip) { + tsleep((caddr_t)srcswp, PVM, "spgout", 0); + } + splx(s); + +/* + * clean all of the pages that are currently active and finished + */ + (void) swap_pager_clean(); + + s = splbio(); +/* + * clear source block before destination object + * (release allocated space) + */ + for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) { + int valid; + int *addr = swap_pager_diskaddr(srcswp, i, &valid); + if (addr && *addr != SWB_EMPTY) { + swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1); + if( valid) + vm_swap_size += btodb(PAGE_SIZE); + swapsizecheck(); + *addr = SWB_EMPTY; + } + } +/* + * transfer source to destination + */ + for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) { + int srcvalid, dstvalid; + int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset, + &srcvalid); + int *dstaddrp; + /* + * see if the source has space allocated + */ + if (srcaddrp && *srcaddrp != SWB_EMPTY) { + /* + * if the source is valid and the dest has no space, then + * copy the allocation from the srouce to the dest. + */ + if (srcvalid) { + dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid); + /* + * if the dest already has a valid block, deallocate the + * source block without copying. 
+ */ + if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { + swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1); + *dstaddrp = SWB_EMPTY; + } + if (dstaddrp && *dstaddrp == SWB_EMPTY) { + *dstaddrp = *srcaddrp; + *srcaddrp = SWB_EMPTY; + swap_pager_setvalid(dstswp, i + dstoffset, 1); + vm_swap_size -= btodb(PAGE_SIZE); + } + } + /* + * if the source is not empty at this point, then deallocate the space. + */ + if (*srcaddrp != SWB_EMPTY) { + swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1); + if( srcvalid) + vm_swap_size += btodb(PAGE_SIZE); + *srcaddrp = SWB_EMPTY; + } + } + } + +/* + * deallocate the rest of the source object + */ + for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) { + int valid; + int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid); + if (srcaddrp && *srcaddrp != SWB_EMPTY) { + swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1); + if( valid) + vm_swap_size += btodb(PAGE_SIZE); + *srcaddrp = SWB_EMPTY; + } + } + + swapsizecheck(); + splx(s); + + free((caddr_t)srcswp->sw_blocks, M_VMPGDATA); + srcswp->sw_blocks = 0; + free((caddr_t)srcswp, M_VMPGDATA); + srcpager->pg_data = 0; + free((caddr_t)srcpager, M_VMPAGER); + + return; +} + + +void swap_pager_dealloc(pager) vm_pager_t pager; { - register int i; + register int i,j; register sw_blk_t bp; register sw_pager_t swp; - struct swtab *swt; int s; -#ifdef DEBUG - /* save panic time state */ - if ((swpagerdebug & SDB_ANOMPANIC) && panicstr) - return; - if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC)) - printf("swpg_dealloc(%x)\n", pager); -#endif /* * Remove from list right away so lookups will fail if we * block for pageout completion. */ + s = splbio(); swp = (sw_pager_t) pager->pg_data; if (swp->sw_flags & SW_NAMED) { TAILQ_REMOVE(&swap_pager_list, pager, pg_list); swp->sw_flags &= ~SW_NAMED; + } else { + TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list); } -#ifdef DEBUG - for (swt = swtab; swt->st_osize; swt++) - if (swp->sw_osize <= swt->st_osize) - break; - swt->st_inuse--; -#endif - /* * Wait for all pageouts to finish and remove * all entries from cleaning list. */ - s = splbio(); + while (swp->sw_poip) { - swp->sw_flags |= SW_WANTED; - (void) tsleep(swp, PVM, "swpgdealloc", 0); + tsleep((caddr_t)swp, PVM, "swpout", 0); } splx(s); - swap_pager_clean(B_WRITE); + + + (void) swap_pager_clean(); /* * Free left over swap blocks */ - for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) - if (bp->swb_block) { -#ifdef DEBUG - if (swpagerdebug & (SDB_ALLOCBLK|SDB_FULL)) - printf("swpg_dealloc: blk %x\n", - bp->swb_block); -#endif - rmfree(swapmap, swp->sw_bsize, bp->swb_block); + s = splbio(); + for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) { + for (j = 0; j < SWB_NPAGES; j++) + if (bp->swb_block[j] != SWB_EMPTY) { + swap_pager_freeswapspace((unsigned)bp->swb_block[j], + (unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1); + if( bp->swb_valid & (1<<j)) + vm_swap_size += btodb(PAGE_SIZE); + bp->swb_block[j] = SWB_EMPTY; } + } + splx(s); + swapsizecheck(); + /* * Free swap management resources */ free((caddr_t)swp->sw_blocks, M_VMPGDATA); + swp->sw_blocks = 0; free((caddr_t)swp, M_VMPGDATA); + pager->pg_data = 0; free((caddr_t)pager, M_VMPAGER); } -static int -swap_pager_getpage(pager, mlist, npages, sync) +/* + * swap_pager_getmulti can get multiple pages. 
+ */ +int +swap_pager_getmulti(pager, m, count, reqpage, sync) + vm_pager_t pager; + vm_page_t *m; + int count; + int reqpage; + boolean_t sync; +{ + if( reqpage >= count) + panic("swap_pager_getmulti: reqpage >= count\n"); + return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage); +} + +/* + * swap_pager_getpage gets individual pages + */ +int +swap_pager_getpage(pager, m, sync) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t m; boolean_t sync; { -#ifdef DEBUG - if (swpagerdebug & SDB_FOLLOW) - printf("swpg_getpage(%x, %x, %x, %x)\n", - pager, mlist, npages, sync); -#endif - return(swap_pager_io((sw_pager_t)pager->pg_data, - mlist, npages, B_READ)); + vm_page_t marray[1]; + + marray[0] = m; + return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0); } -static int -swap_pager_putpage(pager, mlist, npages, sync) +int +swap_pager_putmulti(pager, m, c, sync, rtvals) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t *m; + int c; boolean_t sync; + int *rtvals; { int flags; -#ifdef DEBUG - if (swpagerdebug & SDB_FOLLOW) - printf("swpg_putpage(%x, %x, %x, %x)\n", - pager, mlist, npages, sync); -#endif if (pager == NULL) { - swap_pager_clean(B_WRITE); - return (VM_PAGER_OK); /* ??? */ + (void) swap_pager_clean(); + return VM_PAGER_OK; } + flags = B_WRITE; if (!sync) flags |= B_ASYNC; - return(swap_pager_io((sw_pager_t)pager->pg_data, - mlist, npages, flags)); + + return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals); } -static boolean_t -swap_pager_haspage(pager, offset) +/* + * swap_pager_putpage writes individual pages + */ +int +swap_pager_putpage(pager, m, sync) vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ + int flags; + vm_page_t marray[1]; + int rtvals[1]; + + + if (pager == NULL) { + (void) swap_pager_clean(); + return VM_PAGER_OK; + } + + marray[0] = m; + flags = B_WRITE; + if (!sync) + flags |= B_ASYNC; + + swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals); + + return rtvals[0]; +} + +static inline int +const swap_pager_block_index(swp, offset) + sw_pager_t swp; + vm_offset_t offset; +{ + return (offset / (SWB_NPAGES*PAGE_SIZE)); +} + +static inline int +const swap_pager_block_offset(swp, offset) + sw_pager_t swp; + vm_offset_t offset; +{ + return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE); +} + +/* + * _swap_pager_haspage returns TRUE if the pager has data that has + * been written out. 
+ */ +static boolean_t +_swap_pager_haspage(swp, offset) + sw_pager_t swp; vm_offset_t offset; { - register sw_pager_t swp; register sw_blk_t swb; int ix; -#ifdef DEBUG - if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK)) - printf("swpg_haspage(%x, %x) ", pager, offset); -#endif - swp = (sw_pager_t) pager->pg_data; - ix = offset / dbtob(swp->sw_bsize); + ix = offset / (SWB_NPAGES*PAGE_SIZE); if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { -#ifdef DEBUG - if (swpagerdebug & (SDB_FAIL|SDB_FOLLOW|SDB_ALLOCBLK)) - printf("swpg_haspage: %x bad offset %x, ix %x\n", - swp->sw_blocks, offset, ix); -#endif return(FALSE); } swb = &swp->sw_blocks[ix]; - if (swb->swb_block) - ix = atop(offset % dbtob(swp->sw_bsize)); -#ifdef DEBUG - if (swpagerdebug & SDB_ALLOCBLK) - printf("%x blk %x+%x ", swp->sw_blocks, swb->swb_block, ix); - if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK)) - printf("-> %c\n", - "FT"[swb->swb_block && (swb->swb_mask & (1 << ix))]); -#endif - if (swb->swb_block && (swb->swb_mask & (1 << ix))) - return(TRUE); + ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; + if (swb->swb_block[ix] != SWB_EMPTY) { + if (swb->swb_valid & (1 << ix)) + return TRUE; + } + return(FALSE); } +/* + * swap_pager_haspage is the externally accessible version of + * _swap_pager_haspage above. this routine takes a vm_pager_t + * for an argument instead of sw_pager_t. + */ +boolean_t +swap_pager_haspage(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ + return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset); +} + +/* + * swap_pager_freepage is a convienience routine that clears the busy + * bit and deallocates a page. + */ static void -swap_pager_cluster(pager, offset, loffset, hoffset) - vm_pager_t pager; - vm_offset_t offset; - vm_offset_t *loffset; - vm_offset_t *hoffset; +swap_pager_freepage(m) + vm_page_t m; { - sw_pager_t swp; - register int bsize; - vm_offset_t loff, hoff; + PAGE_WAKEUP(m); + vm_page_free(m); +} -#ifdef DEBUG - if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER)) - printf("swpg_cluster(%x, %x) ", pager, offset); -#endif - swp = (sw_pager_t) pager->pg_data; - bsize = dbtob(swp->sw_bsize); - if (bsize > swap_pager_maxcluster) - bsize = swap_pager_maxcluster; - - loff = offset - (offset % bsize); - if (loff >= swp->sw_osize) - panic("swap_pager_cluster: bad offset"); - - hoff = loff + bsize; - if (hoff > swp->sw_osize) - hoff = swp->sw_osize; - - *loffset = loff; - *hoffset = hoff; -#ifdef DEBUG - if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER)) - printf("returns [%x-%x]\n", loff, hoff); -#endif +/* + * swap_pager_ridpages is a convienience routine that deallocates all + * but the required page. this is usually used in error returns that + * need to invalidate the "extra" readahead pages. + */ +static void +swap_pager_ridpages(m, count, reqpage) + vm_page_t *m; + int count; + int reqpage; +{ + int i; + for (i = 0; i < count; i++) + if (i != reqpage) + swap_pager_freepage(m[i]); } +int swapwritecount=0; + /* - * Scaled down version of swap(). - * Assumes that PAGE_SIZE < MAXPHYS; i.e. only one operation needed. - * BOGUS: lower level IO routines expect a KVA so we have to map our - * provided physical page into the KVA to keep them happy. 
+ * swap_pager_iodone1 is the completion routine for both reads and async writes */ -static int -swap_pager_io(swp, mlist, npages, flags) +void +swap_pager_iodone1(bp) + struct buf *bp; +{ + bp->b_flags |= B_DONE; + bp->b_flags &= ~B_ASYNC; + wakeup((caddr_t)bp); +/* + if ((bp->b_flags & B_READ) == 0) + vwakeup(bp); +*/ +} + + +int +swap_pager_input(swp, m, count, reqpage) register sw_pager_t swp; - vm_page_t *mlist; - int npages; - int flags; + vm_page_t *m; + int count, reqpage; { register struct buf *bp; - register sw_blk_t swb; + sw_blk_t swb[count]; register int s; - int ix, mask; + int i; boolean_t rv; - vm_offset_t kva, off; + vm_offset_t kva, off[count]; swp_clean_t spc; - vm_page_t m; + vm_offset_t paging_offset; + vm_object_t object; + int reqaddr[count]; -#ifdef DEBUG - /* save panic time state */ - if ((swpagerdebug & SDB_ANOMPANIC) && panicstr) - return (VM_PAGER_FAIL); /* XXX: correct return? */ - if (swpagerdebug & (SDB_FOLLOW|SDB_IO)) - printf("swpg_io(%x, %x, %x, %x)\n", swp, mlist, npages, flags); - if (flags & B_READ) { - if (flags & B_ASYNC) - panic("swap_pager_io: cannot do ASYNC reads"); - if (npages != 1) - panic("swap_pager_io: cannot do clustered reads"); - } -#endif + int first, last; + int failed; + int reqdskregion; + object = m[reqpage]->object; + paging_offset = object->paging_offset; /* * First determine if the page exists in the pager if this is * a sync read. This quickly handles cases where we are * following shadow chains looking for the top level object * with the page. */ - m = *mlist; - off = m->offset + m->object->paging_offset; - ix = off / dbtob(swp->sw_bsize); - if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { -#ifdef DEBUG - if ((flags & B_READ) == 0 && (swpagerdebug & SDB_ANOM)) { - printf("swap_pager_io: no swap block on write\n"); - return(VM_PAGER_BAD); - } -#endif + if (swp->sw_blocks == NULL) { + swap_pager_ridpages(m, count, reqpage); return(VM_PAGER_FAIL); } - swb = &swp->sw_blocks[ix]; - off = off % dbtob(swp->sw_bsize); - if ((flags & B_READ) && - (swb->swb_block == 0 || (swb->swb_mask & (1 << atop(off))) == 0)) + + for(i = 0; i < count; i++) { + vm_offset_t foff = m[i]->offset + paging_offset; + int ix = swap_pager_block_index(swp, foff); + if (ix >= swp->sw_nblocks) { + int j; + if( i <= reqpage) { + swap_pager_ridpages(m, count, reqpage); + return(VM_PAGER_FAIL); + } + for(j = i; j < count; j++) { + swap_pager_freepage(m[j]); + } + count = i; + break; + } + + swb[i] = &swp->sw_blocks[ix]; + off[i] = swap_pager_block_offset(swp, foff); + reqaddr[i] = swb[i]->swb_block[off[i]]; + } + + /* make sure that our required input request is existant */ + + if (reqaddr[reqpage] == SWB_EMPTY || + (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { + swap_pager_ridpages(m, count, reqpage); return(VM_PAGER_FAIL); + } + + + reqdskregion = reqaddr[reqpage] / dmmax; /* - * For reads (pageins) and synchronous writes, we clean up - * all completed async pageouts. 
+ * search backwards for the first contiguous page to transfer */ - if ((flags & B_ASYNC) == 0) { - s = splbio(); - swap_pager_clean(flags&B_READ); -#ifdef DEBUG - if (swpagerdebug & SDB_PARANOIA) - swap_pager_clean_check(mlist, npages, flags&B_READ); -#endif - splx(s); + failed = 0; + first = 0; + for (i = reqpage - 1; i >= 0; --i) { + if ( failed || (reqaddr[i] == SWB_EMPTY) || + (swb[i]->swb_valid & (1 << off[i])) == 0 || + (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || + ((reqaddr[i] / dmmax) != reqdskregion)) { + failed = 1; + swap_pager_freepage(m[i]); + if (first == 0) + first = i + 1; + } } /* - * For async writes (pageouts), we cleanup completed pageouts so - * that all available resources are freed. Also tells us if this - * page is already being cleaned. If it is, or no resources - * are available, we try again later. + * search forwards for the last contiguous page to transfer */ - else { - swap_pager_clean(B_WRITE); -#ifdef DEBUG - if (swpagerdebug & SDB_PARANOIA) - swap_pager_clean_check(mlist, npages, B_WRITE); -#endif - if (swap_pager_free.tqh_first == NULL) { -#ifdef DEBUG - if (swpagerdebug & SDB_FAIL) - printf("%s: no available io headers\n", - "swap_pager_io"); -#endif - return(VM_PAGER_AGAIN); + failed = 0; + last = count; + for (i = reqpage + 1; i < count; i++) { + if ( failed || (reqaddr[i] == SWB_EMPTY) || + (swb[i]->swb_valid & (1 << off[i])) == 0 || + (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || + ((reqaddr[i] / dmmax) != reqdskregion)) { + failed = 1; + swap_pager_freepage(m[i]); + if (last == count) + last = i; + } + } + + count = last; + if (first != 0) { + for (i = first; i < count; i++) { + m[i-first] = m[i]; + reqaddr[i-first] = reqaddr[i]; + off[i-first] = off[i]; } + count -= first; + reqpage -= first; } + ++swb[reqpage]->swb_locked; + /* - * Allocate a swap block if necessary. + * at this point: + * "m" is a pointer to the array of vm_page_t for paging I/O + * "count" is the number of vm_page_t entries represented by "m" + * "object" is the vm_object_t for I/O + * "reqpage" is the index into "m" for the page actually faulted */ - if (swb->swb_block == 0) { - swb->swb_block = rmalloc(swapmap, swp->sw_bsize); - if (swb->swb_block == 0) { -#ifdef DEBUG - if (swpagerdebug & SDB_FAIL) - printf("swpg_io: rmalloc of %x failed\n", - swp->sw_bsize); -#endif - /* - * XXX this is technically a resource shortage that - * should return AGAIN, but the situation isn't likely - * to be remedied just by delaying a little while and - * trying again (the pageout daemon's current response - * to AGAIN) so we just return FAIL. - */ - return(VM_PAGER_FAIL); + + spc = NULL; /* we might not use an spc data structure */ + kva = 0; + + /* + * we allocate a new kva for transfers > 1 page + * but for transfers == 1 page, the swap_pager_free list contains + * entries that have pre-allocated kva's (for efficiency). + */ + if (count > 1) { + kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE); + } + + + if (!kva) { + /* + * if a kva has not been allocated, we can only do a one page transfer, + * so we free the other pages that might have been allocated by + * vm_fault. 
+ */ + swap_pager_ridpages(m, count, reqpage); + m[0] = m[reqpage]; + reqaddr[0] = reqaddr[reqpage]; + + count = 1; + reqpage = 0; + /* + * get a swap pager clean data structure, block until we get it + */ + if (swap_pager_free.tqh_first == NULL) { + s = splbio(); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + while (swap_pager_free.tqh_first == NULL) { + swap_pager_needflags |= SWAP_FREE_NEEDED; + tsleep((caddr_t)&swap_pager_free, + PVM, "swpfre", 0); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + } + splx(s); } -#ifdef DEBUG - if (swpagerdebug & (SDB_FULL|SDB_ALLOCBLK)) - printf("swpg_io: %x alloc blk %x at ix %x\n", - swp->sw_blocks, swb->swb_block, ix); -#endif + spc = swap_pager_free.tqh_first; + TAILQ_REMOVE(&swap_pager_free, spc, spc_list); + kva = spc->spc_kva; } + /* - * Allocate a kernel virtual address and initialize so that PTE - * is available for lower level IO drivers. + * map our page(s) into kva for input */ - kva = vm_pager_map_pages(mlist, npages, !(flags & B_ASYNC)); - if (kva == NULL) { -#ifdef DEBUG - if (swpagerdebug & SDB_FAIL) - printf("%s: no KVA space to map pages\n", - "swap_pager_io"); -#endif - return(VM_PAGER_AGAIN); + for (i = 0; i < count; i++) { + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); } + pmap_update(); + /* - * Get a swap buffer header and initialize it. + * Get a swap buffer header and perform the IO */ - s = splbio(); - while (bswlist.b_actf == NULL) { -#ifdef DEBUG - if (swpagerdebug & SDB_ANOM) - printf("swap_pager_io: wait on swbuf for %x (%d)\n", - m, flags); -#endif - bswlist.b_flags |= B_WANTED; - tsleep((caddr_t)&bswlist, PSWP+1, "swpgiobuf", 0); + if( spc) { + bp = spc->spc_bp; + bzero(bp, sizeof *bp); + bp->b_spc = spc; + } else { + bp = getpbuf(); } - bp = bswlist.b_actf; - bswlist.b_actf = bp->b_actf; - splx(s); - bp->b_flags = B_BUSY | (flags & B_READ); + + s = splbio(); + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = swap_pager_iodone1; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ - bp->b_data = (caddr_t)kva; - bp->b_blkno = swb->swb_block + btodb(off); + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + crhold(bp->b_rcred); + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = reqaddr[0]; + bp->b_bcount = PAGE_SIZE*count; + bp->b_bufsize = PAGE_SIZE*count; + +/* VHOLD(swapdev_vp); bp->b_vp = swapdev_vp; if (swapdev_vp->v_type == VBLK) bp->b_dev = swapdev_vp->v_rdev; - bp->b_bcount = npages * PAGE_SIZE; +*/ + bgetvp( swapdev_vp, bp); + + swp->sw_piip++; /* - * For writes we set up additional buffer fields, record a pageout - * in progress and mark that these swap blocks are now allocated. + * perform the I/O */ - if ((bp->b_flags & B_READ) == 0) { - bp->b_dirtyoff = 0; - bp->b_dirtyend = npages * PAGE_SIZE; - swapdev_vp->v_numoutput++; + VOP_STRATEGY(bp); + + /* + * wait for the sync I/O to complete + */ + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "swread", 0); + } + rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK; + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE); + + --swp->sw_piip; + if (swp->sw_piip == 0) + wakeup((caddr_t) swp); + + /* + * relpbuf does this, but we maintain our own buffer + * list also... 
+ */ + if (bp->b_vp) + brelvp(bp); + + splx(s); + --swb[reqpage]->swb_locked; + + /* + * remove the mapping for kernel virtual + */ + pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE); + + if (spc) { + /* + * if we have used an spc, we need to free it. + */ + if( bp->b_rcred != NOCRED) + crfree(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crfree(bp->b_wcred); + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + if (swap_pager_needflags & SWAP_FREE_NEEDED) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + } + } else { + /* + * free the kernel virtual addresses + */ + kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE); + /* + * release the physical I/O buffer + */ + relpbuf(bp); + /* + * finish up input if everything is ok + */ + if( rv == VM_PAGER_OK) { + for (i = 0; i < count; i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + if (i != reqpage) { + /* + * whether or not to leave the page activated + * is up in the air, but we should put the page + * on a page queue somewhere. (it already is in + * the object). + * After some emperical results, it is best + * to deactivate the readahead pages. + */ + vm_page_deactivate(m[i]); + m[i]->act_count = 2; + + /* + * just in case someone was asking for this + * page we now tell them that it is ok to use + */ + m[i]->flags &= ~PG_FAKE; + PAGE_WAKEUP(m[i]); + } + } + if( swap_pager_full) { + _swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE); + } + } else { + swap_pager_ridpages(m, count, reqpage); + } + } + return(rv); +} + +int +swap_pager_output(swp, m, count, flags, rtvals) + register sw_pager_t swp; + vm_page_t *m; + int count; + int flags; + int *rtvals; +{ + register struct buf *bp; + sw_blk_t swb[count]; + register int s; + int i, j, ix; + boolean_t rv; + vm_offset_t kva, off, foff; + swp_clean_t spc; + vm_offset_t paging_offset; + vm_object_t object; + int reqaddr[count]; + int failed; + +/* + if( count > 1) + printf("off: 0x%x, count: %d\n", m[0]->offset, count); +*/ + spc = NULL; + + object = m[0]->object; + paging_offset = object->paging_offset; + + failed = 0; + for(j=0;j<count;j++) { + foff = m[j]->offset + paging_offset; + ix = swap_pager_block_index(swp, foff); + swb[j] = 0; + if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { + rtvals[j] = VM_PAGER_FAIL; + failed = 1; + continue; + } else { + rtvals[j] = VM_PAGER_OK; + } + swb[j] = &swp->sw_blocks[ix]; + ++swb[j]->swb_locked; + if( failed) { + rtvals[j] = VM_PAGER_FAIL; + continue; + } + off = swap_pager_block_offset(swp, foff); + reqaddr[j] = swb[j]->swb_block[off]; + if( reqaddr[j] == SWB_EMPTY) { + int blk; + int tries; + int ntoget; + tries = 0; + s = splbio(); + + /* + * if any other pages have been allocated in this block, we + * only try to get one page. + */ + for (i = 0; i < SWB_NPAGES; i++) { + if (swb[j]->swb_block[i] != SWB_EMPTY) + break; + } + + + ntoget = (i == SWB_NPAGES) ? 
SWB_NPAGES : 1; + /* + * this code is alittle conservative, but works + * (the intent of this code is to allocate small chunks + * for small objects) + */ + if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) { + ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE; + } + +retrygetspace: + if (!swap_pager_full && ntoget > 1 && + swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) { + + for (i = 0; i < ntoget; i++) { + swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; + swb[j]->swb_valid = 0; + } + + reqaddr[j] = swb[j]->swb_block[off]; + } else if (!swap_pager_getswapspace(btodb(PAGE_SIZE), + &swb[j]->swb_block[off])) { + /* + * if the allocation has failed, we try to reclaim space and + * retry. + */ + if (++tries == 1) { + swap_pager_reclaim(); + goto retrygetspace; + } + rtvals[j] = VM_PAGER_AGAIN; + failed = 1; + } else { + reqaddr[j] = swb[j]->swb_block[off]; + swb[j]->swb_valid &= ~(1<<off); + } + splx(s); + } + } + + /* + * search forwards for the last contiguous page to transfer + */ + failed = 0; + for (i = 0; i < count; i++) { + if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) || + (reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) || + (rtvals[i] != VM_PAGER_OK)) { + failed = 1; + if( rtvals[i] == VM_PAGER_OK) + rtvals[i] = VM_PAGER_AGAIN; + } + } + + for(i = 0; i < count; i++) { + if( rtvals[i] != VM_PAGER_OK) { + if( swb[i]) + --swb[i]->swb_locked; + } + } + + for(i = 0; i < count; i++) + if( rtvals[i] != VM_PAGER_OK) + break; + + if( i == 0) { + return VM_PAGER_AGAIN; + } + + count = i; + for(i=0;i<count;i++) { + if( reqaddr[i] == SWB_EMPTY) + printf("I/O to empty block????\n"); + } + + /* + */ + + /* + * For synchronous writes, we clean up + * all completed async pageouts. + */ + if ((flags & B_ASYNC) == 0) { + swap_pager_clean(); + } + + kva = 0; + + /* + * we allocate a new kva for transfers > 1 page + * but for transfers == 1 page, the swap_pager_free list contains + * entries that have pre-allocated kva's (for efficiency). 
+ */ + if ( count > 1) { + kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE); + if( !kva) { + for (i = 0; i < count; i++) { + if( swb[i]) + --swb[i]->swb_locked; + rtvals[i] = VM_PAGER_AGAIN; + } + return VM_PAGER_AGAIN; + } + } + + /* + * get a swap pager clean data structure, block until we get it + */ + if (swap_pager_free.tqh_first == NULL) { +/* + if (flags & B_ASYNC) { + for(i=0;i<count;i++) { + rtvals[i] = VM_PAGER_AGAIN; + if( swb[i]) + --swb[i]->swb_locked; + } + return VM_PAGER_AGAIN; + } +*/ + s = splbio(); - swp->sw_poip++; + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + while (swap_pager_free.tqh_first == NULL) { + swap_pager_needflags |= SWAP_FREE_NEEDED; + tsleep((caddr_t)&swap_pager_free, + PVM, "swpfre", 0); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + } splx(s); - mask = (~(~0 << npages)) << atop(off); -#ifdef DEBUG - swap_pager_poip++; - if (swpagerdebug & SDB_WRITE) - printf("swpg_io: write: bp=%x swp=%x poip=%d\n", - bp, swp, swp->sw_poip); - if ((swpagerdebug & SDB_ALLOCBLK) && - (swb->swb_mask & mask) != mask) - printf("swpg_io: %x write %d pages at %x+%x\n", - swp->sw_blocks, npages, swb->swb_block, - atop(off)); - if (swpagerdebug & SDB_CLUSTER) - printf("swpg_io: off=%x, npg=%x, mask=%x, bmask=%x\n", - off, npages, mask, swb->swb_mask); -#endif - swb->swb_mask |= mask; } + + spc = swap_pager_free.tqh_first; + TAILQ_REMOVE(&swap_pager_free, spc, spc_list); + if( !kva) { + kva = spc->spc_kva; + spc->spc_altkva = 0; + } else { + spc->spc_altkva = kva; + } + + /* + * map our page(s) into kva for I/O + */ + for (i = 0; i < count; i++) { + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + } + pmap_update(); + /* - * If this is an async write we set up still more buffer fields + * get the base I/O offset into the swap file + */ + for(i=0;i<count;i++) { + foff = m[i]->offset + paging_offset; + off = swap_pager_block_offset(swp, foff); + /* + * if we are setting the valid bit anew, + * then diminish the swap free space + */ + if( (swb[i]->swb_valid & (1 << off)) == 0) + vm_swap_size -= btodb(PAGE_SIZE); + + /* + * set the valid bit + */ + swb[i]->swb_valid |= (1 << off); + /* + * and unlock the data structure + */ + --swb[i]->swb_locked; + } + + s = splbio(); + /* + * Get a swap buffer header and perform the IO + */ + bp = spc->spc_bp; + bzero(bp, sizeof *bp); + bp->b_spc = spc; + + bp->b_flags = B_BUSY; + bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + crhold(bp->b_rcred); + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = reqaddr[0]; + bgetvp( swapdev_vp, bp); +/* + VHOLD(swapdev_vp); + bp->b_vp = swapdev_vp; + if (swapdev_vp->v_type == VBLK) + bp->b_dev = swapdev_vp->v_rdev; +*/ + bp->b_bcount = PAGE_SIZE*count; + bp->b_bufsize = PAGE_SIZE*count; + swapdev_vp->v_numoutput++; + + /* + * If this is an async write we set up additional buffer fields * and place a "cleaning" entry on the inuse queue. 
*/ - if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) { -#ifdef DEBUG - if (swap_pager_free.tqh_first == NULL) - panic("swpg_io: lost spc"); -#endif - spc = swap_pager_free.tqh_first; - TAILQ_REMOVE(&swap_pager_free, spc, spc_list); -#ifdef DEBUG - if (spc->spc_flags != SPC_FREE) - panic("swpg_io: bad free spc"); -#endif - spc->spc_flags = SPC_BUSY; - spc->spc_bp = bp; + if ( flags & B_ASYNC ) { + spc->spc_flags = 0; spc->spc_swp = swp; - spc->spc_kva = kva; + for(i=0;i<count;i++) + spc->spc_m[i] = m[i]; + spc->spc_count = count; /* - * Record the first page. This allows swap_pager_clean - * to efficiently handle the common case of a single page. - * For clusters, it allows us to locate the object easily - * and we then reconstruct the rest of the mlist from spc_kva. + * the completion routine for async writes */ - spc->spc_m = m; - spc->spc_npages = npages; bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone; - s = splbio(); + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bcount; + swp->sw_poip++; TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); + } else { + swp->sw_poip++; + bp->b_flags |= B_CALL; + bp->b_iodone = swap_pager_iodone1; + } + /* + * perform the I/O + */ + VOP_STRATEGY(bp); + if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) { + if ((bp->b_flags & B_DONE) == B_DONE) { + swap_pager_clean(); + } splx(s); + for(i=0;i<count;i++) { + rtvals[i] = VM_PAGER_PEND; + } + return VM_PAGER_PEND; } /* - * Finally, start the IO operation. - * If it is async we are all done, otherwise we must wait for - * completion and cleanup afterwards. + * wait for the sync I/O to complete */ -#ifdef DEBUG - if (swpagerdebug & SDB_IO) - printf("swpg_io: IO start: bp %x, db %x, va %x, pa %x\n", - bp, swb->swb_block+btodb(off), kva, VM_PAGE_TO_PHYS(m)); -#endif - VOP_STRATEGY(bp); - if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) { -#ifdef DEBUG - if (swpagerdebug & SDB_IO) - printf("swpg_io: IO started: bp %x\n", bp); -#endif - return(VM_PAGER_PEND); + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "swwrt", 0); } - s = splbio(); -#ifdef DEBUG - if (flags & B_READ) - swap_pager_piip++; - else - swap_pager_poip++; -#endif - while ((bp->b_flags & B_DONE) == 0) - (void) tsleep(bp, PVM, "swpgio", 0); - if ((flags & B_READ) == 0) - --swp->sw_poip; -#ifdef DEBUG - if (flags & B_READ) - --swap_pager_piip; - else - --swap_pager_poip; -#endif - rv = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK; - bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY); - bp->b_actf = bswlist.b_actf; - bswlist.b_actf = bp; + rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK; + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE); + + --swp->sw_poip; + if (swp->sw_poip == 0) + wakeup((caddr_t) swp); + if (bp->b_vp) brelvp(bp); - if (bswlist.b_flags & B_WANTED) { - bswlist.b_flags &= ~B_WANTED; - wakeup(&bswlist); + + splx(s); + + /* + * remove the mapping for kernel virtual + */ + pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE); + + /* + * if we have written the page, then indicate that the page + * is clean. + */ + if (rv == VM_PAGER_OK) { + for(i=0;i<count;i++) { + if( rtvals[i] == VM_PAGER_OK) { + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + /* + * optimization, if a page has been read during the + * pageout process, we activate it. 
+ */ + if ( (m[i]->flags & PG_ACTIVE) == 0 && + pmap_is_referenced(VM_PAGE_TO_PHYS(m[i]))) + vm_page_activate(m[i]); + } + } + } else { + for(i=0;i<count;i++) { + rtvals[i] = rv; + m[i]->flags |= PG_LAUNDRY; + } } - if ((flags & B_READ) == 0 && rv == VM_PAGER_OK) { - m->flags |= PG_CLEAN; - pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + + if( spc->spc_altkva) + kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE); + + if( bp->b_rcred != NOCRED) + crfree(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crfree(bp->b_wcred); + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + if (swap_pager_needflags & SWAP_FREE_NEEDED) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); } - splx(s); -#ifdef DEBUG - if (swpagerdebug & SDB_IO) - printf("swpg_io: IO done: bp %x, rv %d\n", bp, rv); - if ((swpagerdebug & SDB_FAIL) && rv == VM_PAGER_ERROR) - printf("swpg_io: IO error\n"); -#endif - vm_pager_unmap_pages(kva, npages); + return(rv); } -static void -swap_pager_clean(rw) - int rw; +boolean_t +swap_pager_clean() { - register swp_clean_t spc; - register int s, i; - vm_object_t object; - vm_page_t m; - -#ifdef DEBUG - /* save panic time state */ - if ((swpagerdebug & SDB_ANOMPANIC) && panicstr) - return; - if (swpagerdebug & SDB_FOLLOW) - printf("swpg_clean(%x)\n", rw); -#endif + register swp_clean_t spc, tspc; + register int s; + tspc = NULL; + if (swap_pager_done.tqh_first == NULL) + return FALSE; for (;;) { + s = splbio(); /* - * Look up and removal from inuse list must be done + * Look up and removal from done list must be done * at splbio() to avoid conflicts with swap_pager_iodone. */ - s = splbio(); - for (spc = swap_pager_inuse.tqh_first; - spc != NULL; - spc = spc->spc_list.tqe_next) { - /* - * If the operation is done, remove it from the - * list and process it. - * - * XXX if we can't get the object lock we also - * leave it on the list and try again later. - * Is there something better we could do? - */ - if ((spc->spc_flags & SPC_DONE) && - vm_object_lock_try(spc->spc_m->object)) { - TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); - break; + while (spc = swap_pager_done.tqh_first) { + if( spc->spc_altkva) { + pmap_remove(vm_map_pmap(pager_map), spc->spc_altkva, spc->spc_altkva + spc->spc_count * PAGE_SIZE); + kmem_free_wakeup(pager_map, spc->spc_altkva, spc->spc_count * PAGE_SIZE); + spc->spc_altkva = 0; + } else { + pmap_remove(vm_map_pmap(pager_map), spc->spc_kva, spc->spc_kva + PAGE_SIZE); } + swap_pager_finish(spc); + TAILQ_REMOVE(&swap_pager_done, spc, spc_list); + goto doclean; } - splx(s); /* * No operations done, thats all we can do for now. */ - if (spc == NULL) - break; - /* - * Found a completed operation so finish it off. - * Note: no longer at splbio since entry is off the list. - */ - m = spc->spc_m; - object = m->object; + splx(s); + break; /* - * Process each page in the cluster. - * The first page is explicitly kept in the cleaning - * entry, others must be reconstructed from the KVA. + * The desired page was found to be busy earlier in + * the scan but has since completed. */ - for (i = 0; i < spc->spc_npages; i++) { - if (i) - m = vm_pager_atop(spc->spc_kva + ptoa(i)); - /* - * If no error mark as clean and inform the pmap - * system. If there was an error, mark as dirty - * so we will try again. - * - * XXX could get stuck doing this, should give up - * after awhile. 
- */ - if (spc->spc_flags & SPC_ERROR) { - printf("%s: clean of page %x failed\n", - "swap_pager_clean", - VM_PAGE_TO_PHYS(m)); - m->flags |= PG_LAUNDRY; - } else { - m->flags |= PG_CLEAN; - pmap_clear_modify(VM_PAGE_TO_PHYS(m)); - } - m->flags &= ~PG_BUSY; - PAGE_WAKEUP(m); +doclean: + if (tspc && tspc == spc) { + tspc = NULL; } + spc->spc_flags = 0; + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + if (swap_pager_needflags & SWAP_FREE_NEEDED) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + } + ++cleandone; + splx(s); + } - /* - * Done with the object, decrement the paging count - * and unlock it. - */ - if (--object->paging_in_progress == 0) - wakeup(object); - vm_object_unlock(object); + return(tspc ? TRUE : FALSE); +} +void +swap_pager_finish(spc) + register swp_clean_t spc; +{ + vm_object_t object = spc->spc_m[0]->object; + int i; + + if ((object->paging_in_progress -= spc->spc_count) == 0) + thread_wakeup((int) object); + + /* + * If no error mark as clean and inform the pmap system. + * If error, mark as dirty so we will try again. + * (XXX could get stuck doing this, should give up after awhile) + */ + if (spc->spc_flags & SPC_ERROR) { + for(i=0;i<spc->spc_count;i++) { + printf("swap_pager_finish: clean of page %x failed\n", + VM_PAGE_TO_PHYS(spc->spc_m[i])); + spc->spc_m[i]->flags |= PG_LAUNDRY; + } + } else { + for(i=0;i<spc->spc_count;i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i])); + spc->spc_m[i]->flags |= PG_CLEAN; + } + } + + + for(i=0;i<spc->spc_count;i++) { /* - * Free up KVM used and put the entry back on the list. + * we wakeup any processes that are waiting on + * these pages. */ - vm_pager_unmap_pages(spc->spc_kva, spc->spc_npages); - spc->spc_flags = SPC_FREE; - TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); -#ifdef DEBUG - if (swpagerdebug & SDB_WRITE) - printf("swpg_clean: free spc %x\n", spc); -#endif + PAGE_WAKEUP(spc->spc_m[i]); } + nswiodone -= spc->spc_count; + + return; } -#ifdef DEBUG -static void -swap_pager_clean_check(mlist, npages, rw) - vm_page_t *mlist; - int npages; - int rw; +/* + * swap_pager_iodone + */ +void +swap_pager_iodone(bp) + register struct buf *bp; { register swp_clean_t spc; - boolean_t bad; - int i, j, s; - vm_page_t m; + int s; - if (panicstr) - return; + s = splbio(); + spc = (swp_clean_t) bp->b_spc; + TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); + TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); + if (bp->b_flags & B_ERROR) { + spc->spc_flags |= SPC_ERROR; + printf("error %d blkno %d sz %d ", + bp->b_error, bp->b_blkno, bp->b_bcount); + } + +/* + if ((bp->b_flags & B_READ) == 0) + vwakeup(bp); +*/ + + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC); + if (bp->b_vp) { + brelvp(bp); + } + if( bp->b_rcred != NOCRED) + crfree(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crfree(bp->b_wcred); + + nswiodone += spc->spc_count; + if (--spc->spc_swp->sw_poip == 0) { + wakeup((caddr_t)spc->spc_swp); + } + + if ((swap_pager_needflags & SWAP_FREE_NEEDED) || + swap_pager_inuse.tqh_first == 0) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + wakeup((caddr_t)&vm_pages_needed); + } + + if (vm_pageout_pages_needed) { + wakeup((caddr_t)&vm_pageout_pages_needed); + } + + if ((swap_pager_inuse.tqh_first == NULL) || + (cnt.v_free_count < cnt.v_free_min && + nswiodone + cnt.v_free_count >= cnt.v_free_min) ) { + wakeup((caddr_t)&vm_pages_needed); + } + splx(s); +} + +int bswneeded; +/* TAILQ_HEAD(swqueue, buf) bswlist; */ +/* + * allocate a 
physical buffer + */ +struct buf * +getpbuf() { + int s; + struct buf *bp; - bad = FALSE; s = splbio(); - for (spc = swap_pager_inuse.tqh_first; - spc != NULL; - spc = spc->spc_list.tqe_next) { - for (j = 0; j < spc->spc_npages; j++) { - m = vm_pager_atop(spc->spc_kva + ptoa(j)); - for (i = 0; i < npages; i++) - if (m == mlist[i]) { - if (swpagerdebug & SDB_ANOM) - printf( - "swpg_clean_check: %s: page %x on list, flags %x\n", - rw == B_WRITE ? "write" : "read", mlist[i], spc->spc_flags); - bad = TRUE; - } - } + /* get a bp from the swap buffer header pool */ + while ((bp = bswlist.tqh_first) == NULL) { + bswneeded = 1; + tsleep((caddr_t)&bswneeded, PVM, "wswbuf", 0); } + TAILQ_REMOVE(&bswlist, bp, b_freelist); + splx(s); - if (bad) - panic("swpg_clean_check"); + + bzero(bp, sizeof *bp); + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + return bp; } -#endif -static void -swap_pager_iodone(bp) - register struct buf *bp; -{ - register swp_clean_t spc; - daddr_t blk; +/* + * allocate a physical buffer, if one is available + */ +struct buf * +trypbuf() { int s; + struct buf *bp; -#ifdef DEBUG - /* save panic time state */ - if ((swpagerdebug & SDB_ANOMPANIC) && panicstr) - return; - if (swpagerdebug & SDB_FOLLOW) - printf("swpg_iodone(%x)\n", bp); -#endif s = splbio(); - for (spc = swap_pager_inuse.tqh_first; - spc != NULL; - spc = spc->spc_list.tqe_next) - if (spc->spc_bp == bp) - break; -#ifdef DEBUG - if (spc == NULL) - panic("swap_pager_iodone: bp not found"); -#endif + if ((bp = bswlist.tqh_first) == NULL) { + splx(s); + return NULL; + } + TAILQ_REMOVE(&bswlist, bp, b_freelist); + splx(s); - spc->spc_flags &= ~SPC_BUSY; - spc->spc_flags |= SPC_DONE; - if (bp->b_flags & B_ERROR) - spc->spc_flags |= SPC_ERROR; - spc->spc_bp = NULL; - blk = bp->b_blkno; - -#ifdef DEBUG - --swap_pager_poip; - if (swpagerdebug & SDB_WRITE) - printf("swpg_iodone: bp=%x swp=%x flags=%x spc=%x poip=%x\n", - bp, spc->spc_swp, spc->spc_swp->sw_flags, - spc, spc->spc_swp->sw_poip); -#endif + bzero(bp, sizeof *bp); + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + return bp; +} + +/* + * release a physical buffer + */ +void +relpbuf(bp) + struct buf *bp; +{ + int s; - spc->spc_swp->sw_poip--; - if (spc->spc_swp->sw_flags & SW_WANTED) { - spc->spc_swp->sw_flags &= ~SW_WANTED; - wakeup(spc->spc_swp); + s = splbio(); + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; } - - bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY); - bp->b_actf = bswlist.b_actf; - bswlist.b_actf = bp; + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (bp->b_vp) brelvp(bp); - if (bswlist.b_flags & B_WANTED) { - bswlist.b_flags &= ~B_WANTED; - wakeup(&bswlist); + + TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); + + if (bswneeded) { + bswneeded = 0; + wakeup((caddr_t)&bswlist); } - wakeup(&vm_pages_needed); splx(s); } + +/* + * return true if any swap control structures can be allocated + */ +int +swap_pager_ready() { + if( swap_pager_free.tqh_first) + return 1; + else + return 0; +} diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index 497d92a..853edd5 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -1,7 +1,7 @@ /* * Copyright (c) 1990 University of Utah. - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. 
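The getpbuf()/trypbuf()/relpbuf() trio added above turns the old open-coded bswlist handling into a small pool API: a blocking allocator, a non-blocking one, and a release that wakes a waiter. A userspace analogue follows, with pthread primitives standing in for splbio()/tsleep()/wakeup() and all sizes invented.

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    #define NPBUF 2

    struct pbuf { struct pbuf *next; char data[64]; };

    static struct pbuf pool[NPBUF];
    static struct pbuf *freelist;
    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv  = PTHREAD_COND_INITIALIZER;

    static struct pbuf *getpbuf(void)              /* blocks until available */
    {
        struct pbuf *bp;

        pthread_mutex_lock(&mtx);
        while ((bp = freelist) == NULL)
            pthread_cond_wait(&cv, &mtx);          /* tsleep("wswbuf") */
        freelist = bp->next;
        pthread_mutex_unlock(&mtx);
        memset(bp->data, 0, sizeof(bp->data));     /* bzero(bp, ...) */
        return bp;
    }

    static struct pbuf *trypbuf(void)              /* NULL if none free */
    {
        struct pbuf *bp;

        pthread_mutex_lock(&mtx);
        if ((bp = freelist) != NULL)
            freelist = bp->next;
        pthread_mutex_unlock(&mtx);
        return bp;
    }

    static void relpbuf(struct pbuf *bp)
    {
        pthread_mutex_lock(&mtx);
        bp->next = freelist;
        freelist = bp;
        pthread_cond_signal(&cv);                  /* wakeup(&bswlist) */
        pthread_mutex_unlock(&mtx);
    }

    int main(void)
    {
        for (int i = 0; i < NPBUF; i++)
            relpbuf(&pool[i]);                     /* populate the free list */
        struct pbuf *a = getpbuf(), *b = getpbuf();
        printf("trypbuf with empty pool: %p\n", (void *)trypbuf());
        relpbuf(a); relpbuf(b);
        return 0;
    }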
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer @@ -35,39 +35,31 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)swap_pager.h 8.1 (Berkeley) 6/11/93 + * from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90 + * $Id: swap_pager.h,v 1.9 1994/03/14 21:54:23 davidg Exp $ + */ + +/* + * Modifications to the block allocation data structure by John S. Dyson + * 18 Dec 93. */ #ifndef _SWAP_PAGER_ #define _SWAP_PAGER_ 1 /* - * In the swap pager, the backing store for an object is organized as an - * array of some number of "swap blocks". A swap block consists of a bitmask - * and some number of contiguous DEV_BSIZE disk blocks. The minimum size - * of a swap block is: - * - * max(PAGE_SIZE, dmmin*DEV_BSIZE) [ 32k currently ] - * - * bytes (since the pager interface is page oriented), the maximum size is: - * - * min(#bits(swb_mask)*PAGE_SIZE, dmmax*DEV_BSIZE) [ 128k currently ] - * - * where dmmin and dmmax are left over from the old VM interface. The bitmask - * (swb_mask) is used by swap_pager_haspage() to determine if a particular - * page has actually been written; i.e. the pager copy of the page is valid. - * All swap blocks in the backing store of an object will be the same size. - * - * The reason for variable sized swap blocks is to reduce fragmentation of - * swap resources. Whenever possible we allocate smaller swap blocks to - * smaller objects. The swap block size is determined from a table of - * object-size vs. swap-block-size computed at boot time. + * SWB_NPAGES can be set to any value from 1 to 16 pages per allocation, + * however, due to the allocation spilling into non-swap pager backed memory, + * suggest keeping SWB_NPAGES small (1-4). If high performance is manditory + * perhaps up to 8 pages might be in order???? + * Above problem has been fixed, now we support 16 pages per block. Unused + * space is recovered by the swap pager now... 
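The struct swblock that follows replaces the old single-extent block with a fixed array of page slots, where swb_valid carries one bit per slot to record which pages have actually been written to swap. A small self-contained sketch of the bit manipulation involved; only the struct mirrors the header, the helper names are invented.

    #include <stdio.h>

    #define SWB_NPAGES 8

    struct swblock {
        unsigned short swb_valid;            /* bitmask for valid pages */
        unsigned short swb_locked;
        int swb_block[SWB_NPAGES];
    };

    static void swb_set_valid(struct swblock *swb, int i) { swb->swb_valid |= 1 << i; }
    static void swb_clr_valid(struct swblock *swb, int i) { swb->swb_valid &= ~(1 << i); }
    static int  swb_is_valid(struct swblock *swb, int i)  { return (swb->swb_valid >> i) & 1; }

    int main(void)
    {
        struct swblock swb = { 0 };

        swb_set_valid(&swb, 3);              /* page 3 written to swap */
        printf("page 3 valid: %d\n", swb_is_valid(&swb, 3));
        swb_clr_valid(&swb, 3);              /* swap space reclaimed */
        printf("page 3 valid: %d\n", swb_is_valid(&swb, 3));
        return 0;
    }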
*/ -typedef int sw_bm_t; /* pager bitmask */ - +#define SWB_NPAGES 8 struct swblock { - sw_bm_t swb_mask; /* bitmask of valid pages in this block */ - daddr_t swb_block; /* starting disk block for this block */ + unsigned short swb_valid; /* bitmask for valid pages */ + unsigned short swb_locked; /* block locked */ + int swb_block[SWB_NPAGES]; /* unfortunately int instead of daddr_t */ }; typedef struct swblock *sw_blk_t; @@ -76,15 +68,32 @@ typedef struct swblock *sw_blk_t; */ struct swpager { vm_size_t sw_osize; /* size of object we are backing (bytes) */ - int sw_bsize; /* size of swap blocks (DEV_BSIZE units) */ int sw_nblocks;/* number of blocks in list (sw_blk_t units) */ sw_blk_t sw_blocks; /* pointer to list of swap blocks */ short sw_flags; /* flags */ short sw_poip; /* pageouts in progress */ + short sw_piip; /* pageins in progress */ }; typedef struct swpager *sw_pager_t; #define SW_WANTED 0x01 #define SW_NAMED 0x02 +#ifdef KERNEL + +void swap_pager_init(void); +vm_pager_t swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t); +void swap_pager_dealloc(vm_pager_t); +boolean_t swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t); +boolean_t swap_pager_haspage(vm_pager_t, vm_offset_t); +int swap_pager_io(sw_pager_t, vm_page_t *, int, int, int); +void swap_pager_iodone(struct buf *); +boolean_t swap_pager_clean(); + +extern struct pagerops swappagerops; + +#endif + #endif /* _SWAP_PAGER_ */ diff --git a/sys/vm/vm.h b/sys/vm/vm.h index 85f892f..bc18dd2 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -36,7 +36,7 @@ #ifndef VM_H #define VM_H -typedef int vm_inherit_t; /* XXX: inheritance codes */ +typedef char vm_inherit_t; /* XXX: inheritance codes */ union vm_map_object; typedef union vm_map_object vm_map_object_t; @@ -58,6 +58,7 @@ typedef struct pager_struct *vm_pager_t; #include <sys/vmmeter.h> #include <sys/queue.h> +#include <machine/cpufunc.h> #include <vm/vm_param.h> #include <vm/lock.h> #include <vm/vm_prot.h> @@ -87,5 +88,6 @@ struct vmspace { caddr_t vm_taddr; /* user virtual address of text XXX */ caddr_t vm_daddr; /* user virtual address of data XXX */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ + caddr_t vm_minsaddr; /* user VA at max stack growth */ }; #endif /* VM_H */ diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index bae5f00..bc62e42 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -45,6 +45,16 @@ struct vnode; void chgkprot __P((caddr_t, int, int)); #endif +/* + * Try to get semi-meaningful wait messages into thread_sleep... 
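The wrapper declared just below lets the compiler supply a useful wait-channel message: under GCC the caller's function name is passed, otherwise a fixed string. Here is a compile-time sketch of the same trick using standard C99 __func__ in place of the old __FUNCTION__ spelling; the two-argument form is a simplification.

    #include <stdio.h>

    static void thread_sleep_(int event, void *lock, const char *wmesg)
    {
        (void)event; (void)lock;
        printf("sleeping, wait message: %s\n", wmesg);
    }

    #if defined(__GNUC__)
    #define thread_sleep(a, b) thread_sleep_((a), (b), __func__)
    #else
    #define thread_sleep(a, b) thread_sleep_((a), (b), "vmslp")
    #endif

    static void pageout_daemon(void)
    {
        thread_sleep(1, NULL);   /* prints "pageout_daemon" under GCC */
    }

    int main(void)
    {
        pageout_daemon();
        return 0;
    }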
+ */ +extern void thread_sleep_(int, simple_lock_t, char *); +#if __GNUC__ >= 2 +#define thread_sleep(a,b,c) thread_sleep_((a), (b), __FUNCTION__) +#else +#define thread_sleep(a,b,c) thread_sleep_((a), (b), "vmslp") +#endif + #ifdef KERNEL #ifdef TYPEDEF_FOR_UAP int getpagesize __P((struct proc *p, void *, int *)); @@ -88,7 +98,7 @@ void swapout __P((struct proc *)); void swapout_threads __P((void)); int swfree __P((struct proc *, int)); void swstrategy __P((struct buf *)); -void thread_block __P((void)); +void thread_block __P((char *)); void thread_sleep __P((int, simple_lock_t, boolean_t)); void thread_wakeup __P((int)); int useracc __P((caddr_t, int, int)); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index f60abf2..3ce2d6e 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1,6 +1,11 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. @@ -68,11 +73,21 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> #include <vm/vm.h> #include <vm/vm_page.h> #include <vm/vm_pageout.h> + +#define VM_FAULT_READ_AHEAD 4 +#define VM_FAULT_READ_AHEAD_MIN 1 +#define VM_FAULT_READ_BEHIND 3 +#define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) +extern int swap_pager_full; +extern int vm_pageout_proc_limit; + /* * vm_fault: * @@ -103,7 +118,7 @@ vm_fault(map, vaddr, fault_type, change_wiring) vm_map_entry_t entry; register vm_object_t object; register vm_offset_t offset; - register vm_page_t m; + vm_page_t m; vm_page_t first_m; vm_prot_t prot; int result; @@ -113,6 +128,10 @@ vm_fault(map, vaddr, fault_type, change_wiring) boolean_t page_exists; vm_page_t old_m; vm_object_t next_object; + vm_page_t marray[VM_FAULT_READ]; + int reqpage; + int spl; + int hardfault=0; cnt.v_faults++; /* needs lock XXX */ /* @@ -141,11 +160,15 @@ vm_fault(map, vaddr, fault_type, change_wiring) #define UNLOCK_THINGS { \ object->paging_in_progress--; \ + if (object->paging_in_progress == 0) \ + wakeup((caddr_t)object); \ vm_object_unlock(object); \ if (object != first_object) { \ vm_object_lock(first_object); \ FREE_PAGE(first_m); \ first_object->paging_in_progress--; \ + if (first_object->paging_in_progress == 0) \ + wakeup((caddr_t)first_object); \ vm_object_unlock(first_object); \ } \ UNLOCK_MAP; \ @@ -156,6 +179,7 @@ vm_fault(map, vaddr, fault_type, change_wiring) vm_object_deallocate(first_object); \ } + RetryFault: ; /* @@ -164,8 +188,8 @@ vm_fault(map, vaddr, fault_type, change_wiring) */ if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, - &first_object, &first_offset, - &prot, &wired, &su)) != KERN_SUCCESS) { + &first_object, &first_offset, + &prot, &wired, &su)) != KERN_SUCCESS) { return(result); } lookup_still_valid = TRUE; @@ -241,25 +265,13 @@ vm_fault(map, vaddr, fault_type, change_wiring) * wait for it and then retry. 
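The hunk that follows drops the old DOTHREADS variant in favor of a simple protocol: a faulting thread that finds the page busy marks it PG_WANTED, sleeps on the page address, and retries the whole fault from the top. Here is the protocol reduced to flags, with the tsleep/wakeup calls left as comments since this is a userspace sketch.

    #include <stdio.h>

    #define PG_BUSY   0x01
    #define PG_WANTED 0x02

    struct page { int flags; };

    static int fault_on(struct page *m)
    {
        if (m->flags & PG_BUSY) {
            m->flags |= PG_WANTED;   /* ask the current holder to wake us */
            /* kernel: tsleep(m, PSWP, "vmpfw", 0); then goto RetryFault */
            return 0;                /* caller retries the whole fault */
        }
        m->flags |= PG_BUSY;         /* we own the page now */
        return 1;
    }

    static void page_wakeup(struct page *m)
    {
        m->flags &= ~PG_BUSY;
        if (m->flags & PG_WANTED) {
            m->flags &= ~PG_WANTED;
            /* kernel: wakeup(m); */
        }
    }

    int main(void)
    {
        struct page m = { PG_BUSY };

        printf("first attempt ok: %d\n", fault_on(&m));  /* 0: must retry */
        page_wakeup(&m);
        printf("second attempt ok: %d\n", fault_on(&m)); /* 1: page grabbed */
        return 0;
    }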
*/ if (m->flags & PG_BUSY) { -#ifdef DOTHREADS - int wait_result; - - PAGE_ASSERT_WAIT(m, !change_wiring); - UNLOCK_THINGS; - thread_block(); - wait_result = current_thread()->wait_result; - vm_object_deallocate(first_object); - if (wait_result != THREAD_AWAKENED) - return(KERN_SUCCESS); - goto RetryFault; -#else - PAGE_ASSERT_WAIT(m, !change_wiring); UNLOCK_THINGS; - cnt.v_intrans++; - thread_block(); + if (m->flags & PG_BUSY) { + m->flags |= PG_WANTED; + tsleep((caddr_t)m,PSWP,"vmpfw",0); + } vm_object_deallocate(first_object); goto RetryFault; -#endif } /* @@ -268,6 +280,7 @@ vm_fault(map, vaddr, fault_type, change_wiring) */ vm_page_lock_queues(); + spl = splimp(); if (m->flags & PG_INACTIVE) { TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); m->flags &= ~PG_INACTIVE; @@ -280,6 +293,7 @@ vm_fault(map, vaddr, fault_type, change_wiring) m->flags &= ~PG_ACTIVE; cnt.v_active_count--; } + splx(spl); vm_page_unlock_queues(); /* @@ -290,9 +304,31 @@ vm_fault(map, vaddr, fault_type, change_wiring) } if (((object->pager != NULL) && - (!change_wiring || wired)) + (!change_wiring || wired)) || (object == first_object)) { +#if 0 + if (curproc && (vaddr < VM_MAXUSER_ADDRESS) && + (curproc->p_rlimit[RLIMIT_RSS].rlim_max < + curproc->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG)) { + UNLOCK_AND_DEALLOCATE; + vm_fault_free_pages(curproc); + goto RetryFault; + } +#endif + + if (swap_pager_full && !object->shadow && (!object->pager || + (object->pager && object->pager->pg_type == PG_SWAP && + !vm_pager_has_page(object->pager, offset+object->paging_offset)))) { + if (vaddr < VM_MAXUSER_ADDRESS && curproc && curproc->p_pid >= 48) /* XXX */ { + printf("Process %d killed by vm_fault -- out of swap\n", curproc->p_pid); + psignal(curproc, SIGKILL); + curproc->p_estcpu = 0; + curproc->p_nice = PRIO_MIN; + setpriority(curproc); + } + } + /* * Allocate a new page for this object/offset * pair. @@ -309,33 +345,46 @@ vm_fault(map, vaddr, fault_type, change_wiring) if (object->pager != NULL && (!change_wiring || wired)) { int rv; + int faultcount; + int reqpage; /* * Now that we have a busy page, we can * release the object lock. */ vm_object_unlock(object); - /* - * Call the pager to retrieve the data, if any, - * after releasing the lock on the map. + * now we find out if any other pages should + * be paged in at this time + * this routine checks to see if the pages surrounding this fault + * reside in the same object as the page for this fault. If + * they do, then they are faulted in also into the + * object. The array "marray" returned contains an array of + * vm_page_t structs where one of them is the vm_page_t passed to + * the routine. The reqpage return value is the index into the + * marray for the vm_page_t passed to the routine. */ - UNLOCK_MAP; cnt.v_pageins++; - rv = vm_pager_get(object->pager, m, TRUE); + faultcount = vm_fault_additional_pages(first_object, first_offset, + m, VM_FAULT_READ_BEHIND, VM_FAULT_READ_AHEAD, marray, &reqpage); /* - * Reaquire the object lock to preserve our - * invariant. + * Call the pager to retrieve the data, if any, + * after releasing the lock on the map. */ - vm_object_lock(object); + UNLOCK_MAP; - /* - * Found the page. - * Leave it busy while we play with it. - */ + rv = faultcount ? + vm_pager_get_pages(object->pager, + marray, faultcount, reqpage, TRUE): VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* + * Found the page. + * Leave it busy while we play with it. + */ + vm_object_lock(object); + + /* * Relookup in case pager changed page. 
* Pager is responsible for disposition * of old page if moved. @@ -344,36 +393,42 @@ vm_fault(map, vaddr, fault_type, change_wiring) cnt.v_pgpgin++; m->flags &= ~PG_FAKE; - m->flags |= PG_CLEAN; pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + hardfault++; break; } /* - * IO error or page outside the range of the pager: - * cleanup and return an error. + * Remove the bogus page (which does not + * exist at this object/offset); before + * doing so, we must get back our object + * lock to preserve our invariant. + * + * Also wake up any other thread that may want + * to bring in this page. + * + * If this is the top-level object, we must + * leave the busy page to prevent another + * thread from rushing past us, and inserting + * the page in that object at the same time + * that we are. */ - if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { + + vm_object_lock(object); + /* + * Data outside the range of the pager; an error + */ + if ((rv == VM_PAGER_ERROR) || (rv == VM_PAGER_BAD)) { FREE_PAGE(m); UNLOCK_AND_DEALLOCATE; return(KERN_PROTECTION_FAILURE); /* XXX */ } - /* - * rv == VM_PAGER_FAIL: - * - * Page does not exist at this object/offset. - * Free the bogus page (waking up anyone waiting - * for it) and continue on to the next object. - * - * If this is the top-level object, we must - * leave the busy page to prevent another - * thread from rushing past us, and inserting - * the page in that object at the same time - * that we are. - */ if (object != first_object) { FREE_PAGE(m); - /* note that `m' is not used after this */ + /* + * XXX - we cannot just fall out at this + * point, m has been freed and is invalid! + */ } } @@ -398,6 +453,8 @@ vm_fault(map, vaddr, fault_type, change_wiring) */ if (object != first_object) { object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); vm_object_unlock(object); object = first_object; @@ -414,16 +471,20 @@ vm_fault(map, vaddr, fault_type, change_wiring) } else { vm_object_lock(next_object); - if (object != first_object) + if (object != first_object) { object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); + } vm_object_unlock(object); object = next_object; object->paging_in_progress++; } } - if ((m->flags & (PG_ACTIVE | PG_INACTIVE | PG_BUSY)) != PG_BUSY) - panic("vm_fault: active, inactive or !busy after main loop"); + if ((m->flags & (PG_ACTIVE|PG_INACTIVE) != 0) || + (m->flags & PG_BUSY) == 0) + panic("vm_fault: absent or active or inactive or not busy after main loop"); /* * PAGE HAS BEEN FOUND. @@ -486,9 +547,11 @@ vm_fault(map, vaddr, fault_type, change_wiring) */ vm_page_lock_queues(); + vm_page_activate(m); - vm_page_deactivate(m); pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + if ((m->flags & PG_CLEAN) == 0) + m->flags |= PG_LAUNDRY; vm_page_unlock_queues(); /* @@ -496,6 +559,8 @@ vm_fault(map, vaddr, fault_type, change_wiring) */ PAGE_WAKEUP(m); object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); vm_object_unlock(object); /* @@ -517,6 +582,8 @@ vm_fault(map, vaddr, fault_type, change_wiring) * paging_in_progress to do that... 
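A pattern recurs throughout this patch: paging_in_progress is now also a wait channel, so every decrement that can reach zero must wake sleepers (vm_object_terminate blocks on it). Reduced to its skeleton:

    #include <stdio.h>

    struct object { int paging_in_progress; };

    static void paging_end(struct object *obj)
    {
        if (--obj->paging_in_progress == 0) {
            /* kernel: wakeup((caddr_t)obj); unblocks vm_object_terminate */
            printf("last pager op done: wakeup(object)\n");
        }
    }

    int main(void)
    {
        struct object obj = { 2 };

        paging_end(&obj);   /* one op still in flight: silent */
        paging_end(&obj);   /* count reaches zero: wakes waiters */
        return 0;
    }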
*/ object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); vm_object_collapse(object); object->paging_in_progress++; } @@ -572,38 +639,18 @@ vm_fault(map, vaddr, fault_type, change_wiring) copy_m = vm_page_lookup(copy_object, copy_offset); if (page_exists = (copy_m != NULL)) { if (copy_m->flags & PG_BUSY) { -#ifdef DOTHREADS - int wait_result; - - /* - * If the page is being brought - * in, wait for it and then retry. - */ - PAGE_ASSERT_WAIT(copy_m, !change_wiring); - RELEASE_PAGE(m); - copy_object->ref_count--; - vm_object_unlock(copy_object); - UNLOCK_THINGS; - thread_block(); - wait_result = current_thread()->wait_result; - vm_object_deallocate(first_object); - if (wait_result != THREAD_AWAKENED) - return(KERN_SUCCESS); - goto RetryFault; -#else /* * If the page is being brought * in, wait for it and then retry. */ - PAGE_ASSERT_WAIT(copy_m, !change_wiring); + PAGE_ASSERT_WAIT(copy_m, !change_wiring); RELEASE_PAGE(m); copy_object->ref_count--; vm_object_unlock(copy_object); UNLOCK_THINGS; - thread_block(); + thread_block("fltcpy"); vm_object_deallocate(first_object); goto RetryFault; -#endif } } @@ -625,8 +672,7 @@ vm_fault(map, vaddr, fault_type, change_wiring) * found that the copy_object's pager * doesn't have the page... */ - copy_m = vm_page_alloc(copy_object, - copy_offset); + copy_m = vm_page_alloc(copy_object, copy_offset); if (copy_m == NULL) { /* * Wait for a page, then retry. @@ -700,10 +746,16 @@ vm_fault(map, vaddr, fault_type, change_wiring) * pmaps use it.) */ vm_page_lock_queues(); + + vm_page_activate(old_m); + + pmap_page_protect(VM_PAGE_TO_PHYS(old_m), VM_PROT_NONE); + if ((old_m->flags & PG_CLEAN) == 0) + old_m->flags |= PG_LAUNDRY; copy_m->flags &= ~PG_CLEAN; - vm_page_activate(copy_m); /* XXX */ + vm_page_activate(copy_m); vm_page_unlock_queues(); PAGE_WAKEUP(copy_m); @@ -832,8 +884,18 @@ vm_fault(map, vaddr, fault_type, change_wiring) else vm_page_unwire(m); } - else + else { vm_page_activate(m); + } + + if( curproc && curproc->p_stats) { + if (hardfault) { + curproc->p_stats->p_ru.ru_majflt++; + } else { + curproc->p_stats->p_ru.ru_minflt++; + } + } + vm_page_unlock_queues(); /* @@ -857,9 +919,10 @@ vm_fault_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { + register vm_offset_t va; register pmap_t pmap; - int rv; + int rv; pmap = vm_map_pmap(map); @@ -893,7 +956,8 @@ vm_fault_wire(map, start, end) * * Unwire a range of virtual addresses in a map. */ -void vm_fault_unwire(map, start, end) +void +vm_fault_unwire(map, start, end) vm_map_t map; vm_offset_t start, end; { @@ -942,13 +1006,13 @@ void vm_fault_unwire(map, start, end) * entry corresponding to a main map entry that is wired down). 
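A few hunks below, the patch adds vm_fault_page_lookup(), which resolves an offset by walking the shadow chain. The walk itself, stripped of the pager checks: follow shadow pointers, biasing the offset by shadow_offset at each hop, until a resident page (or the end of the chain) is reached. Names and the residency test are simplified for illustration.

    #include <stdio.h>
    #include <stddef.h>

    struct object {
        struct object *shadow;
        long shadow_offset;
        long resident_offset;    /* stand-in for vm_page_lookup() */
    };

    static struct object *
    chain_lookup(struct object *obj, long offset, long *found_offset)
    {
        while (obj->resident_offset != offset) {
            if (obj->shadow == NULL)
                return NULL;                 /* nothing backs this page */
            offset += obj->shadow_offset;    /* translate into the shadow */
            obj = obj->shadow;
        }
        *found_offset = offset;
        return obj;
    }

    int main(void)
    {
        struct object backing = { NULL, 0, 8192 };
        struct object front   = { &backing, 4096, -1 };
        long off;
        struct object *o = chain_lookup(&front, 4096, &off);

        printf("found in %s at offset %ld\n",
            o == &backing ? "backing object" : "front object", off);
        return 0;
    }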
*/ -void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) +void +vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { - vm_object_t dst_object; vm_object_t src_object; vm_offset_t dst_offset; @@ -960,7 +1024,7 @@ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) #ifdef lint src_map++; -#endif +#endif lint src_object = src_entry->object.vm_object; src_offset = src_entry->offset; @@ -1031,5 +1095,211 @@ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) PAGE_WAKEUP(dst_m); vm_object_unlock(dst_object); } +} + + +/* + * looks page up in shadow chain + */ + +int +vm_fault_page_lookup(object, offset, rtobject, rtoffset, rtm) + vm_object_t object; + vm_offset_t offset; + vm_object_t *rtobject; + vm_offset_t *rtoffset; + vm_page_t *rtm; +{ + vm_page_t m; + vm_object_t first_object = object; + + *rtm = 0; + *rtobject = 0; + *rtoffset = 0; + + + while (!(m=vm_page_lookup(object, offset))) { + if (object->pager) { + if (vm_pager_has_page(object->pager, object->paging_offset+offset)) { + *rtobject = object; + *rtoffset = offset; + return 1; + } + } + + if (!object->shadow) + return 0; + else { + offset += object->shadow_offset; + object = object->shadow; + } + } + *rtobject = object; + *rtoffset = offset; + *rtm = m; + return 1; +} + +/* + * This routine checks around the requested page for other pages that + * might be able to be faulted in. + * + * Inputs: + * first_object, first_offset, m, rbehind, rahead + * + * Outputs: + * marray (array of vm_page_t), reqpage (index of requested page) + * + * Return value: + * number of pages in marray + */ +int +vm_fault_additional_pages(first_object, first_offset, m, rbehind, raheada, marray, reqpage) + vm_object_t first_object; + vm_offset_t first_offset; + vm_page_t m; + int rbehind; + int raheada; + vm_page_t *marray; + int *reqpage; +{ + int i; + vm_page_t tmpm; + vm_object_t object; + vm_offset_t offset, startoffset, endoffset, toffset, size; + vm_object_t rtobject; + vm_page_t rtm; + vm_offset_t rtoffset; + vm_offset_t offsetdiff; + int rahead; + int treqpage; + + object = m->object; + offset = m->offset; + + offsetdiff = offset - first_offset; + + /* + * if the requested page is not available, then give up now + */ + + if (!vm_pager_has_page(object->pager, object->paging_offset+offset)) + return 0; + + /* + * if there is no getmulti routine for this pager, then just allow + * one page to be read. + */ +/* + if (!object->pager->pg_ops->pgo_getpages) { + *reqpage = 0; + marray[0] = m; + return 1; + } +*/ + + /* + * try to do any readahead that we might have free pages for. + */ + rahead = raheada; + if (rahead > (cnt.v_free_count - cnt.v_free_reserved)) { + rahead = cnt.v_free_count - cnt.v_free_reserved; + rbehind = 0; + } + + if (cnt.v_free_count < cnt.v_free_min) { + if (rahead > VM_FAULT_READ_AHEAD_MIN) + rahead = VM_FAULT_READ_AHEAD_MIN; + rbehind = 0; + } + + /* + * if we don't have any free pages, then just read one page. 
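Between the clamping above and the scan loops that follow, vm_fault_additional_pages settles on a byte window around the faulting page and an index for the requested page within it. Here is the window arithmetic extracted into a pure function; the free-page clamp is simplified to a single count and a 4k page is assumed.

    #include <stdio.h>

    #define NBPG 4096

    struct window { long start, end; int reqpage; };

    static struct window
    cluster(long offset, long objsize, int rbehind, int rahead, int freepages)
    {
        struct window w;

        if (rahead > freepages) {            /* don't eat the reserve */
            rahead = freepages;
            rbehind = 0;
        }
        if ((long)rbehind * NBPG > offset)   /* can't read before page 0 */
            rbehind = offset / NBPG;

        w.start = offset - (long)rbehind * NBPG;
        w.end   = offset + ((long)rahead + 1) * NBPG;
        if (w.end > objsize)                 /* can't read past the object */
            w.end = objsize;
        w.reqpage = (offset - w.start) / NBPG;
        return w;
    }

    int main(void)
    {
        /* fault at 8k into a 64k object, 3 behind / 4 ahead requested */
        struct window w = cluster(2 * NBPG, 16 * NBPG, 3, 4, 100);

        printf("pages %ld..%ld, faulting page is index %d\n",
            w.start / NBPG, w.end / NBPG - 1, w.reqpage);
        return 0;
    }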
+ */ + if (rahead <= 0) { + *reqpage = 0; + marray[0] = m; + return 1; + } + + /* + * scan backward for the read behind pages -- + * in memory or on disk not in same object + */ + toffset = offset - NBPG; + if( rbehind*NBPG > offset) + rbehind = offset / NBPG; + startoffset = offset - rbehind*NBPG; + while (toffset >= startoffset) { + if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) || + rtm != 0 || rtobject != object) { + startoffset = toffset + NBPG; + break; + } + if( toffset == 0) + break; + toffset -= NBPG; + } + + /* + * scan forward for the read ahead pages -- + * in memory or on disk not in same object + */ + toffset = offset + NBPG; + endoffset = offset + (rahead+1)*NBPG; + while (toffset < object->size && toffset < endoffset) { + if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) || + rtm != 0 || rtobject != object) { + break; + } + toffset += NBPG; + } + endoffset = toffset; + /* calculate number of bytes of pages */ + size = (endoffset - startoffset) / NBPG; + + /* calculate the page offset of the required page */ + treqpage = (offset - startoffset) / NBPG; + + /* see if we have space (again) */ + if (cnt.v_free_count >= cnt.v_free_reserved + size) { + bzero(marray, (rahead + rbehind + 1) * sizeof(vm_page_t)); + /* + * get our pages and don't block for them + */ + for (i = 0; i < size; i++) { + if (i != treqpage) + rtm = vm_page_alloc(object, startoffset + i * NBPG); + else + rtm = m; + marray[i] = rtm; + } + + for (i = 0; i < size; i++) { + if (marray[i] == 0) + break; + } + + /* + * if we could not get our block of pages, then + * free the readahead/readbehind pages. + */ + if (i < size) { + for (i = 0; i < size; i++) { + if (i != treqpage && marray[i]) + FREE_PAGE(marray[i]); + } + *reqpage = 0; + marray[0] = m; + return 1; + } + + *reqpage = treqpage; + return size; + } + *reqpage = 0; + marray[0] = m; + return 1; } + diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 5676ff3..f181ab0 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -67,16 +67,22 @@ #include <sys/buf.h> #include <sys/user.h> +#include <sys/kernel.h> +#include <sys/dkstat.h> + #include <vm/vm.h> #include <vm/vm_page.h> +#include <vm/vm_pageout.h> #include <vm/vm_kern.h> -#include <machine/cpu.h> +#include <machine/stdarg.h> +extern char kstack[]; int avefree = 0; /* XXX */ -unsigned maxdmap = MAXDSIZ; /* XXX */ int readbuffers = 0; /* XXX allow kgdb to read kernel buffer pool */ +/* vm_map_t upages_map; */ +void swapout(struct proc *p); int kernacc(addr, len, rw) caddr_t addr; @@ -89,18 +95,6 @@ kernacc(addr, len, rw) saddr = trunc_page(addr); eaddr = round_page(addr+len); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); - /* - * XXX there are still some things (e.g. the buffer cache) that - * are managed behind the VM system's back so even though an - * address is accessible in the mind of the VM system, there may - * not be physical pages where the VM thinks there is. This can - * lead to bogus allocation of pages in the kernel address space - * or worse, inconsistencies at the pmap level. We only worry - * about the buffer cache for now. - */ - if (!readbuffers && rv && (eaddr > (vm_offset_t)buffers && - saddr < (vm_offset_t)buffers + MAXBSIZE * nbuf)) - rv = FALSE; return(rv == TRUE); } @@ -112,6 +106,23 @@ useracc(addr, len, rw) boolean_t rv; vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + /* + * XXX - specially disallow access to user page tables - they are + * in the map. 
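The useracc() bounds test just below rejects three cases: a start address at or past VM_MAXUSER_ADDRESS, an end address past it, and an addr + len sum that wraps around the top of the address space. A standalone restatement over 32-bit addresses; the limit value is illustrative.

    #include <stdint.h>
    #include <stdio.h>

    #define VM_MAXUSER_ADDRESS 0xbfc00000u    /* illustrative value */

    static int user_range_ok(uint32_t addr, uint32_t len)
    {
        if (addr >= VM_MAXUSER_ADDRESS)
            return 0;                  /* starts at or above the limit */
        if (addr + len > VM_MAXUSER_ADDRESS)
            return 0;                  /* ends above the limit */
        if (addr + len <= addr)
            return 0;                  /* wrapped past 2^32 (or len == 0) */
        return 1;
    }

    int main(void)
    {
        printf("%d\n", user_range_ok(0x1000, 0x1000));           /* 1: fine */
        printf("%d\n", user_range_ok(0xbf000000u, 0x41000000u)); /* 0: wraps */
        return 0;
    }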
+ * + * XXX - don't specially disallow access to the user area - treat + * it as incorrectly as elsewhere. + * + * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was + * only used (as an end address) in trap.c. Use it as an end + * address here too. + */ + if ((vm_offset_t) addr >= VM_MAXUSER_ADDRESS + || (vm_offset_t) addr + len > VM_MAXUSER_ADDRESS + || (vm_offset_t) addr + len <= (vm_offset_t) addr) { + return (FALSE); + } + rv = vm_map_check_protection(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr+len), prot); return(rv == TRUE); @@ -121,40 +132,18 @@ useracc(addr, len, rw) /* * Change protections on kernel pages from addr to addr+len * (presumably so debugger can plant a breakpoint). - * - * We force the protection change at the pmap level. If we were - * to use vm_map_protect a change to allow writing would be lazily- - * applied meaning we would still take a protection fault, something - * we really don't want to do. It would also fragment the kernel - * map unnecessarily. We cannot use pmap_protect since it also won't - * enforce a write-enable request. Using pmap_enter is the only way - * we can ensure the change takes place properly. + * All addresses are assumed to reside in the Sysmap, */ -void chgkprot(addr, len, rw) register caddr_t addr; int len, rw; { - vm_prot_t prot; - vm_offset_t pa, sva, eva; - - prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE; - eva = round_page(addr + len); - for (sva = trunc_page(addr); sva < eva; sva += PAGE_SIZE) { - /* - * Extract physical address for the page. - * We use a cheezy hack to differentiate physical - * page 0 from an invalid mapping, not that it - * really matters... - */ - pa = pmap_extract(kernel_pmap, sva|1); - if (pa == 0) - panic("chgkprot: invalid page"); - pmap_enter(kernel_pmap, sva, pa&~1, prot, TRUE); - } + vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + + vm_map_protect(kernel_map, trunc_page(addr), + round_page(addr+len), prot, FALSE); } #endif - void vslock(addr, len) caddr_t addr; @@ -172,8 +161,8 @@ vsunlock(addr, len, dirtied) { #ifdef lint dirtied++; -#endif - vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), +#endif lint + vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr+len), TRUE); } @@ -194,16 +183,19 @@ vm_fork(p1, p2, isvfork) int isvfork; { register struct user *up; - vm_offset_t addr; + vm_offset_t addr, ptaddr; + int i; + struct vm_map *vp; + + while( cnt.v_free_count < cnt.v_free_min) + VM_WAIT; -#ifdef i386 /* * avoid copying any of the parent's pagetables or other per-process * objects that reside in the map by marking all of them non-inheritable */ (void)vm_map_inherit(&p1->p_vmspace->vm_map, - UPT_MIN_ADDRESS-UPAGES*NBPG, VM_MAX_ADDRESS, VM_INHERIT_NONE); -#endif + UPT_MIN_ADDRESS - UPAGES * NBPG, VM_MAX_ADDRESS, VM_INHERIT_NONE); p2->p_vmspace = vmspace_fork(p1->p_vmspace); #ifdef SYSVSHM @@ -211,23 +203,40 @@ vm_fork(p1, p2, isvfork) shmfork(p1, p2, isvfork); #endif -#ifndef i386 /* * Allocate a wired-down (for now) pcb and kernel stack for the process */ - addr = kmem_alloc_pageable(kernel_map, ctob(UPAGES)); - if (addr == 0) - panic("vm_fork: no more kernel virtual memory"); - vm_map_pageable(kernel_map, addr, addr + ctob(UPAGES), FALSE); -#else -/* XXX somehow, on 386, ocassionally pageout removes active, wired down kstack, -and pagetables, WITHOUT going thru vm_page_unwire! Why this appears to work is -not yet clear, yet it does... 
*/ - addr = kmem_alloc(kernel_map, ctob(UPAGES)); - if (addr == 0) - panic("vm_fork: no more kernel virtual memory"); -#endif - up = (struct user *)addr; + + addr = (vm_offset_t) kstack; + + vp = &p2->p_vmspace->vm_map; + + /* ream out old pagetables and kernel stack */ + (void)vm_deallocate(vp, addr, UPT_MAX_ADDRESS - addr); + + /* get new pagetables and kernel stack */ + (void)vm_allocate(vp, &addr, UPT_MAX_ADDRESS - addr, FALSE); + + /* force in the page table encompassing the UPAGES */ + ptaddr = trunc_page((u_int)vtopte(addr)); + vm_map_pageable(vp, ptaddr, ptaddr + NBPG, FALSE); + + /* and force in (demand-zero) the UPAGES */ + vm_map_pageable(vp, addr, addr + UPAGES * NBPG, FALSE); + + /* get a kernel virtual address for the UPAGES for this proc */ + up = (struct user *)kmem_alloc_pageable(kernel_map, UPAGES * NBPG); + + /* and force-map the upages into the kernel pmap */ + for (i = 0; i < UPAGES; i++) + pmap_enter(vm_map_pmap(kernel_map), + ((vm_offset_t) up) + NBPG * i, + pmap_extract(vp->pmap, addr + NBPG * i), + VM_PROT_READ|VM_PROT_WRITE, 1); + + /* and allow the UPAGES page table entry to be paged (at the vm system level) */ + vm_map_pageable(vp, ptaddr, ptaddr + NBPG, TRUE); + p2->p_addr = up; /* @@ -246,15 +255,7 @@ not yet clear, yet it does... */ ((caddr_t)&up->u_stats.pstat_endcopy - (caddr_t)&up->u_stats.pstat_startcopy)); -#ifdef i386 - { u_int addr = UPT_MIN_ADDRESS - UPAGES*NBPG; struct vm_map *vp; - - vp = &p2->p_vmspace->vm_map; - (void)vm_deallocate(vp, addr, UPT_MAX_ADDRESS - addr); - (void)vm_allocate(vp, &addr, UPT_MAX_ADDRESS - addr, FALSE); - (void)vm_map_inherit(vp, addr, UPT_MAX_ADDRESS, VM_INHERIT_NONE); - } -#endif + /* * cpu_fork will copy and update the kernel stack and pcb, * and make the child ready to run. It marks the child @@ -273,6 +274,7 @@ void vm_init_limits(p) register struct proc *p; { + int tmp; /* * Set up the initial limits on process VM. 
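The vm_init_limits() hunk that follows stops handing every process the entire free page count as its RSS limit; instead it grants two thirds of free memory less a 32-page fudge, except on machines with under 512 free pages. The computation restated below, with ptoa() modeled as a multiply by an assumed 4k page size.

    #include <stdio.h>

    static long initial_rss_bytes(long v_free_count)
    {
        long pages = (2 * v_free_count) / 3 - 32;

        if (v_free_count < 512)      /* small system: allow everything */
            pages = v_free_count;
        return pages * 4096L;        /* ptoa(pages) on a 4k-page machine */
    }

    int main(void)
    {
        printf("free=4096 pages -> rss %ld bytes\n", initial_rss_bytes(4096));
        printf("free=300 pages  -> rss %ld bytes\n", initial_rss_bytes(300));
        return 0;
    }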
@@ -285,11 +287,13 @@ vm_init_limits(p) p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; - p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(cnt.v_free_count); + tmp = ((2 * cnt.v_free_count) / 3) - 32; + if (cnt.v_free_count < 512) + tmp = cnt.v_free_count; + p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(tmp); + p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } -#include <vm/vm_pageout.h> - #ifdef DEBUG int enableswap = 1; int swapdebug = 0; @@ -298,12 +302,67 @@ int swapdebug = 0; #define SDB_SWAPOUT 4 #endif +void +faultin(p) +struct proc *p; +{ + vm_offset_t i; + vm_offset_t vaddr, ptaddr; + vm_offset_t v, v1; + struct user *up; + int s; + int opflag; + + if ((p->p_flag & P_INMEM) == 0) { + int rv0, rv1; + vm_map_t map; + + ++p->p_lock; + + map = &p->p_vmspace->vm_map; + /* force the page table encompassing the kernel stack (upages) */ + ptaddr = trunc_page((u_int)vtopte(kstack)); + vm_map_pageable(map, ptaddr, ptaddr + NBPG, FALSE); + + /* wire in the UPAGES */ + vm_map_pageable(map, (vm_offset_t) kstack, + (vm_offset_t) kstack + UPAGES * NBPG, FALSE); + + /* and map them nicely into the kernel pmap */ + for (i = 0; i < UPAGES; i++) { + vm_offset_t off = i * NBPG; + vm_offset_t pa = (vm_offset_t) + pmap_extract(&p->p_vmspace->vm_pmap, + (vm_offset_t) kstack + off); + pmap_enter(vm_map_pmap(kernel_map), + ((vm_offset_t)p->p_addr) + off, + pa, VM_PROT_READ|VM_PROT_WRITE, 1); + } + + /* and let the page table pages go (at least above pmap level) */ + vm_map_pageable(map, ptaddr, ptaddr + NBPG, TRUE); + + s = splhigh(); + + if (p->p_stat == SRUN) + setrunqueue(p); + + p->p_flag |= P_INMEM; + + /* undo the effect of setting SLOCK above */ + --p->p_lock; + splx(s); + + } + +} + +int swapinreq; +int percentactive; /* - * Brutally simple: - * 1. Attempt to swapin every swaped-out, runnable process in - * order of priority. - * 2. If not enough memory, wake the pageout daemon and let it - * clear some space. + * This swapin algorithm attempts to swap-in processes only if there + * is enough space for them. Of course, if a process waits for a long + * time, it will be swapped in anyway. */ void scheduler() @@ -313,88 +372,104 @@ scheduler() struct proc *pp; int ppri; vm_offset_t addr; - vm_size_t size; + int lastidle, lastrun; + int curidle, currun; + int forceload; + int percent; + int ntries; + + lastidle = 0; + lastrun = 0; loop: -#ifdef DEBUG - while (!enableswap) - sleep((caddr_t)&proc0, PVM); -#endif + ntries = 0; + vmmeter(); + + curidle = cp_time[CP_IDLE]; + currun = cp_time[CP_USER] + cp_time[CP_SYS] + cp_time[CP_NICE]; + percent = (100*(currun-lastrun)) / ( 1 + (currun-lastrun) + (curidle-lastidle)); + lastrun = currun; + lastidle = curidle; + if( percent > 100) + percent = 100; + percentactive = percent; + + if( percentactive < 25) + forceload = 1; + else + forceload = 0; + +loop1: pp = NULL; ppri = INT_MIN; for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { if (p->p_stat == SRUN && (p->p_flag & P_INMEM) == 0) { + int mempri; pri = p->p_swtime + p->p_slptime - p->p_nice * 8; - if (pri > ppri) { + mempri = pri > 0 ? pri : 0; + /* + * if this process is higher priority and there is + * enough space, then select this process instead + * of the previous selection. 
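The scheduler above derives its activity measure from cp_time deltas: busy ticks (user, system, nice) against idle ticks since the last pass, clamped to 100, with swapins forced when the system is under 25 percent active. As a pure function:

    #include <stdio.h>

    static int percent_active(long drun, long didle)
    {
        int percent = (int)((100 * drun) / (1 + drun + didle));

        if (percent > 100)
            percent = 100;
        return percent;
    }

    int main(void)
    {
        int p = percent_active(20, 80);     /* 20 busy / 80 idle ticks */

        printf("percentactive = %d, forceload = %d\n", p, p < 25);
        return 0;
    }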
+ */ + if (pri > ppri && + (((cnt.v_free_count + (mempri * (4*PAGE_SIZE) / PAGE_SIZE) >= (p->p_vmspace->vm_swrss)) || (ntries > 0 && forceload)))) { pp = p; ppri = pri; } } } -#ifdef DEBUG - if (swapdebug & SDB_FOLLOW) - printf("sched: running, procp %x pri %d\n", pp, ppri); -#endif + + if ((pp == NULL) && (ntries == 0) && forceload) { + ++ntries; + goto loop1; + } + /* * Nothing to do, back to sleep */ if ((p = pp) == NULL) { - sleep((caddr_t)&proc0, PVM); + tsleep((caddr_t)&proc0, PVM, "sched", 0); goto loop; } /* - * We would like to bring someone in. - * This part is really bogus cuz we could deadlock on memory - * despite our feeble check. + * We would like to bring someone in. (only if there is space). */ - size = round_page(ctob(UPAGES)); - addr = (vm_offset_t) p->p_addr; - if (cnt.v_free_count > atop(size)) { -#ifdef DEBUG - if (swapdebug & SDB_SWAPIN) - printf("swapin: pid %d(%s)@%x, pri %d free %d\n", - p->p_pid, p->p_comm, p->p_addr, - ppri, cnt.v_free_count); -#endif - vm_map_pageable(kernel_map, addr, addr+size, FALSE); - /* - * Some architectures need to be notified when the - * user area has moved to new physical page(s) (e.g. - * see pmax/pmax/vm_machdep.c). - */ - cpu_swapin(p); - (void) splstatclock(); - if (p->p_stat == SRUN) - setrunqueue(p); - p->p_flag |= P_INMEM; - (void) spl0(); +/* + printf("swapin: %d, free: %d, res: %d, min: %d\n", + p->p_pid, cnt.v_free_count, cnt.v_free_reserved, cnt.v_free_min); +*/ + (void) splhigh(); + if ((forceload && (cnt.v_free_count > (cnt.v_free_reserved + UPAGES + 1))) || + (cnt.v_free_count >= cnt.v_free_min)) { + spl0(); + faultin(p); p->p_swtime = 0; goto loop; - } + } + /* + * log the memory shortage + */ + swapinreq += p->p_vmspace->vm_swrss; /* * Not enough memory, jab the pageout daemon and wait til the * coast is clear. */ -#ifdef DEBUG - if (swapdebug & SDB_FOLLOW) - printf("sched: no room for pid %d(%s), free %d\n", - p->p_pid, p->p_comm, cnt.v_free_count); -#endif - (void) splhigh(); - VM_WAIT; + if( cnt.v_free_count < cnt.v_free_min) { + VM_WAIT; + } else { + tsleep((caddr_t)&proc0, PVM, "sched", 0); + } (void) spl0(); -#ifdef DEBUG - if (swapdebug & SDB_FOLLOW) - printf("sched: room again, free %d\n", cnt.v_free_count); -#endif goto loop; } -#define swappable(p) \ - (((p)->p_flag & \ - (P_SYSTEM | P_INMEM | P_NOSWAP | P_WEXIT | P_PHYSIO)) == P_INMEM) +#define swappable(p) \ + (((p)->p_lock == 0) && \ + ((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|P_WEXIT|P_PHYSIO)) == P_INMEM) +extern int vm_pageout_free_min; /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. 
We try to always "swap" at least one @@ -409,54 +484,86 @@ swapout_threads() register struct proc *p; struct proc *outp, *outp2; int outpri, outpri2; + int tpri; int didswap = 0; + int swapneeded = swapinreq; extern int maxslp; + int runnablenow; + int s; -#ifdef DEBUG - if (!enableswap) - return; -#endif +swapmore: + runnablenow = 0; outp = outp2 = NULL; - outpri = outpri2 = 0; + outpri = outpri2 = INT_MIN; for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { if (!swappable(p)) continue; switch (p->p_stat) { case SRUN: - if (p->p_swtime > outpri2) { + ++runnablenow; + /* + * count the process as being in a runnable state + */ + if ((tpri = p->p_swtime + p->p_nice * 8) > outpri2) { outp2 = p; - outpri2 = p->p_swtime; + outpri2 = tpri; } continue; case SSLEEP: case SSTOP: - if (p->p_slptime >= maxslp) { + /* + * do not swapout a process that is waiting for VM datastructures + * there is a possible deadlock. + */ + if (!lock_try_write( &p->p_vmspace->vm_map.lock)) { + continue; + } + vm_map_unlock( &p->p_vmspace->vm_map); + if (p->p_slptime > maxslp) { swapout(p); didswap++; - } else if (p->p_slptime > outpri) { + } else if ((tpri = p->p_slptime + p->p_nice * 8) > outpri) { outp = p; - outpri = p->p_slptime; + outpri = tpri ; } continue; } } /* - * If we didn't get rid of any real duds, toss out the next most - * likely sleeping/stopped or running candidate. We only do this - * if we are real low on memory since we don't gain much by doing - * it (UPAGES pages). + * We swapout only if there are more than two runnable processes or if + * another process needs some space to swapin. */ - if (didswap == 0 && - cnt.v_free_count <= atop(round_page(ctob(UPAGES)))) { - if ((p = outp) == 0) - p = outp2; -#ifdef DEBUG - if (swapdebug & SDB_SWAPOUT) - printf("swapout_threads: no duds, try procp %x\n", p); -#endif - if (p) + if ((swapinreq || ((percentactive > 90) && (runnablenow > 2))) && + (((cnt.v_free_count + cnt.v_inactive_count) <= (cnt.v_free_target + cnt.v_inactive_target)) || + (cnt.v_free_count < cnt.v_free_min))) { + if ((p = outp) == 0) { + p = outp2; + } + + if (p) { swapout(p); + didswap = 1; + } + } + + /* + * if we previously had found a process to swapout, and we need to swapout + * more then try again. + */ +#if 0 + if( p && swapinreq) + goto swapmore; +#endif + + /* + * If we swapped something out, and another process needed memory, + * then wakeup the sched process. + */ + if (didswap) { + if (swapneeded) + wakeup((caddr_t)&proc0); + swapinreq = 0; } } @@ -465,59 +572,37 @@ swapout(p) register struct proc *p; { vm_offset_t addr; - vm_size_t size; + struct pmap *pmap = &p->p_vmspace->vm_pmap; + vm_map_t map = &p->p_vmspace->vm_map; + vm_offset_t ptaddr; + int i; -#ifdef DEBUG - if (swapdebug & SDB_SWAPOUT) - printf("swapout: pid %d(%s)@%x, stat %x pri %d free %d\n", - p->p_pid, p->p_comm, p->p_addr, p->p_stat, - p->p_slptime, cnt.v_free_count); -#endif - size = round_page(ctob(UPAGES)); - addr = (vm_offset_t) p->p_addr; -#if defined(hp300) || defined(luna68k) + ++p->p_stats->p_ru.ru_nswap; /* - * Ugh! u-area is double mapped to a fixed address behind the - * back of the VM system and accesses are usually through that - * address rather than the per-process address. Hence reference - * and modify information are recorded at the fixed address and - * lost at context switch time. We assume the u-struct and - * kernel stack are always accessed/modified and force it to be so. 
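faultin() and swapout(), above and below, bracket their work with a p_lock hold count, and the new swappable() macro refuses any process with p_lock set, so a process cannot be swapped out while its u-area is being wired in (or vice versa). The handshake reduced to its skeleton, with the P_* flag checks and pmap work left as comments:

    #include <stdio.h>

    struct proc { int p_lock; int inmem; };

    static int swappable(struct proc *p)
    {
        return p->p_lock == 0 && p->inmem;   /* plus the P_* flag checks */
    }

    static void swapout(struct proc *p)
    {
        ++p->p_lock;                 /* block a concurrent faultin() */
        p->inmem = 0;
        /* kernel: pmap_remove() the upages, vm_map_pageable(..., TRUE) */
        --p->p_lock;
    }

    int main(void)
    {
        struct proc p = { 0, 1 };

        printf("swappable: %d\n", swappable(&p));
        swapout(&p);
        printf("in memory after swapout: %d\n", p.inmem);
        return 0;
    }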
+ * remember the process resident count */ - { - register int i; - volatile long tmp; - - for (i = 0; i < UPAGES; i++) { - tmp = *(long *)addr; *(long *)addr = tmp; - addr += NBPG; - } - addr = (vm_offset_t) p->p_addr; - } -#endif -#ifdef mips + p->p_vmspace->vm_swrss = + p->p_vmspace->vm_pmap.pm_stats.resident_count; /* - * Be sure to save the floating point coprocessor state before - * paging out the u-struct. + * and decrement the amount of needed space */ - { - extern struct proc *machFPCurProcPtr; + swapinreq -= min(swapinreq, p->p_vmspace->vm_pmap.pm_stats.resident_count); - if (p == machFPCurProcPtr) { - MachSaveCurFPState(p); - machFPCurProcPtr = (struct proc *)0; - } - } -#endif -#ifndef i386 /* temporary measure till we find spontaineous unwire of kstack */ - vm_map_pageable(kernel_map, addr, addr+size, TRUE); - pmap_collect(vm_map_pmap(&p->p_vmspace->vm_map)); -#endif (void) splhigh(); p->p_flag &= ~P_INMEM; if (p->p_stat == SRUN) remrq(p); (void) spl0(); + + ++p->p_lock; +/* let the upages be paged */ + pmap_remove(vm_map_pmap(kernel_map), + (vm_offset_t) p->p_addr, ((vm_offset_t) p->p_addr) + UPAGES * NBPG); + + vm_map_pageable(map, (vm_offset_t) kstack, + (vm_offset_t) kstack + UPAGES * NBPG, TRUE); + + --p->p_lock; p->p_swtime = 0; } @@ -525,6 +610,7 @@ swapout(p) * The rest of these routines fake thread handling */ +#ifndef assert_wait void assert_wait(event, ruptible) int event; @@ -535,44 +621,38 @@ assert_wait(event, ruptible) #endif curproc->p_thread = event; } +#endif void -thread_block() +thread_block(char *msg) { - int s = splhigh(); - if (curproc->p_thread) - sleep((caddr_t)curproc->p_thread, PVM); - splx(s); + tsleep((caddr_t)curproc->p_thread, PVM, msg, 0); } + void -thread_sleep(event, lock, ruptible) +thread_sleep_(event, lock, wmesg) int event; simple_lock_t lock; - boolean_t ruptible; + char *wmesg; { -#ifdef lint - ruptible++; -#endif - int s = splhigh(); curproc->p_thread = event; simple_unlock(lock); - if (curproc->p_thread) - sleep((caddr_t)event, PVM); - splx(s); + if (curproc->p_thread) { + tsleep((caddr_t)event, PVM, wmesg, 0); + } } +#ifndef thread_wakeup void thread_wakeup(event) int event; { - int s = splhigh(); - wakeup((caddr_t)event); - splx(s); } +#endif /* * DEBUG stuff diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index 4874f9e..a0eac70 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -1,3 +1,4 @@ + /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. @@ -79,7 +80,8 @@ * The start and end address of physical memory is passed in. */ -void vm_mem_init() +void +vm_mem_init() { extern vm_offset_t avail_start, avail_end; extern vm_offset_t virtual_avail, virtual_end; @@ -89,9 +91,9 @@ void vm_mem_init() * From here on, all physical memory is accounted for, * and we use only virtual addresses. */ - vm_set_page_size(); - vm_page_startup(&avail_start, &avail_end); + vm_set_page_size(); + virtual_avail = vm_page_startup(avail_start, avail_end, virtual_avail); /* * Initialize other VM packages */ diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 7e4db63..55a0949 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -292,9 +292,13 @@ kmem_malloc(map, size, canwait) vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); +#if 0 if (canwait) /* XXX should wait */ panic("kmem_malloc: %s too small", map == kmem_map ? 
"kmem_map" : "mb_map"); +#endif + if (canwait) + panic("kmem_malloc: map too small"); return (0); } offset = addr - vm_map_min(kmem_map); @@ -404,7 +408,7 @@ vm_offset_t kmem_alloc_wait(map, size) } assert_wait((int)map, TRUE); vm_map_unlock(map); - thread_block(); + thread_block("kmaw"); } vm_map_insert(map, NULL, (vm_offset_t)0, addr, addr + size); vm_map_unlock(map); diff --git a/sys/vm/vm_kern.h b/sys/vm/vm_kern.h index d0d2c35..c032560 100644 --- a/sys/vm/vm_kern.h +++ b/sys/vm/vm_kern.h @@ -65,8 +65,10 @@ /* Kernel memory management definitions. */ vm_map_t buffer_map; -vm_map_t exec_map; vm_map_t kernel_map; vm_map_t kmem_map; vm_map_t mb_map; +vm_map_t io_map; +vm_map_t clean_map; +vm_map_t pager_map; vm_map_t phys_map; diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 425fe0d..ffffa96 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -73,6 +73,7 @@ #include <vm/vm.h> #include <vm/vm_page.h> #include <vm/vm_object.h> +#include <vm/vm_kern.h> /* * Virtual memory maps provide for the mapping, protection, @@ -137,6 +138,11 @@ vm_size_t kentry_data_size; vm_map_entry_t kentry_free; vm_map_t kmap_free; +int kentry_count; +vm_map_t kmap_free; +static vm_offset_t mapvm=0; +static int mapvmpgcnt=0; + static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t)); @@ -273,27 +279,71 @@ vm_map_init(map, min, max, pageable) * Allocates a VM map entry for insertion. * No entry fields are filled in. This routine is */ -vm_map_entry_t vm_map_entry_create(map) +static struct vm_map_entry *mappool; +static int mappoolcnt; +void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); + +vm_map_entry_t +vm_map_entry_create(map) vm_map_t map; { vm_map_entry_t entry; -#ifdef DEBUG - extern vm_map_t kernel_map, kmem_map, mb_map, pager_map; - boolean_t isspecial; - - isspecial = (map == kernel_map || map == kmem_map || - map == mb_map || map == pager_map); - if (isspecial && map->entries_pageable || - !isspecial && !map->entries_pageable) - panic("vm_map_entry_create: bogus map"); -#endif - if (map->entries_pageable) { + int s; + int i; +#define KENTRY_LOW_WATER 64 +#define MAPENTRY_LOW_WATER 64 + + /* + * This is a *very* nasty (and sort of incomplete) hack!!!! + */ + if (kentry_count < KENTRY_LOW_WATER) { + if (mapvmpgcnt && mapvm) { + vm_page_t m; + if (m = vm_page_alloc(kmem_object, mapvm-vm_map_min(kmem_map))) { + int newentries; + newentries = (NBPG/sizeof (struct vm_map_entry)); + vm_page_wire(m); + m->flags &= ~PG_BUSY; + pmap_enter(vm_map_pmap(kmem_map), mapvm, + VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT, 1); + + entry = (vm_map_entry_t) mapvm; + mapvm += NBPG; + --mapvmpgcnt; + + for (i = 0; i < newentries; i++) { + vm_map_entry_dispose(kernel_map, entry); + entry++; + } + } + } + } + + if (map == kernel_map || map == kmem_map || map == pager_map) { + + if (entry = kentry_free) { + kentry_free = entry->next; + --kentry_count; + return entry; + } + + if (entry = mappool) { + mappool = entry->next; + --mappoolcnt; + return entry; + } + + } else { + if (entry = mappool) { + mappool = entry->next; + --mappoolcnt; + return entry; + } + MALLOC(entry, vm_map_entry_t, sizeof(struct vm_map_entry), M_VMMAPENT, M_WAITOK); - } else { - if (entry = kentry_free) - kentry_free = kentry_free->next; } +dopanic: if (entry == NULL) panic("vm_map_entry_create: out of map entries"); @@ -305,25 +355,28 @@ vm_map_entry_t vm_map_entry_create(map) * * Inverse of vm_map_entry_create. 
*/ -void vm_map_entry_dispose(map, entry) +void +vm_map_entry_dispose(map, entry) vm_map_t map; vm_map_entry_t entry; { -#ifdef DEBUG - extern vm_map_t kernel_map, kmem_map, mb_map, pager_map; - boolean_t isspecial; - - isspecial = (map == kernel_map || map == kmem_map || - map == mb_map || map == pager_map); - if (isspecial && map->entries_pageable || - !isspecial && !map->entries_pageable) - panic("vm_map_entry_dispose: bogus map"); -#endif - if (map->entries_pageable) { - FREE(entry, M_VMMAPENT); - } else { + extern vm_map_t kernel_map, kmem_map, pager_map; + int s; + + if (map == kernel_map || map == kmem_map || map == pager_map || + kentry_count < KENTRY_LOW_WATER) { entry->next = kentry_free; kentry_free = entry; + ++kentry_count; + } else { + if (mappoolcnt < MAPENTRY_LOW_WATER) { + entry->next = mappool; + mappool = entry; + ++mappoolcnt; + return; + } + + FREE(entry, M_VMMAPENT); } } @@ -799,7 +852,7 @@ static void _vm_map_clip_start(map, entry, start) * See if we can simplify this entry first */ - vm_map_simplify_entry(map, entry); + /* vm_map_simplify_entry(map, entry); */ /* * Split off the front portion -- @@ -1130,7 +1183,7 @@ vm_map_pageable(map, start, end, new_pageable) { register vm_map_entry_t entry; vm_map_entry_t start_entry; - register vm_offset_t failed; + register vm_offset_t failed = 0; int rv; vm_map_lock(map); @@ -2546,11 +2599,13 @@ void vm_map_simplify(map, start) if (map->first_free == this_entry) map->first_free = prev_entry; - SAVE_HINT(map, prev_entry); - vm_map_entry_unlink(map, this_entry); - prev_entry->end = this_entry->end; - vm_object_deallocate(this_entry->object.vm_object); - vm_map_entry_dispose(map, this_entry); + if (!this_entry->object.vm_object->paging_in_progress) { + SAVE_HINT(map, prev_entry); + vm_map_entry_unlink(map, this_entry); + prev_entry->end = this_entry->end; + vm_object_deallocate(this_entry->object.vm_object); + vm_map_entry_dispose(map, this_entry); + } } vm_map_unlock(map); } diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index d25b7a2..ee253ef 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -102,11 +102,11 @@ struct vm_map_entry { vm_offset_t end; /* end address */ union vm_map_object object; /* object I point to */ vm_offset_t offset; /* offset into object */ - boolean_t is_a_map; /* Is "object" a map? */ - boolean_t is_sub_map; /* Is "object" a submap? */ + boolean_t is_a_map:1, /* Is "object" a map? */ + is_sub_map:1, /* Is "object" a submap? */ /* Only in sharing maps: */ - boolean_t copy_on_write; /* is data copy-on-write */ - boolean_t needs_copy; /* does object need to be copied */ + copy_on_write:1,/* is data copy-on-write */ + needs_copy:1; /* does object need to be copied */ /* Only in task maps: */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ @@ -176,7 +176,7 @@ typedef struct { /* XXX: number of kernel maps and entries to statically allocate */ #define MAX_KMAP 10 -#define MAX_KMAPENT 500 +#define MAX_KMAPENT 128 #ifdef KERNEL boolean_t vm_map_check_protection __P((vm_map_t, diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 9db6f50..2a8029b 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -95,6 +95,7 @@ loadav(avg) /* * Attributes associated with virtual memory. 
*/ +int vm_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) int *name; u_int namelen; diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 340cded..2e7204a 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -217,8 +217,10 @@ mmap(p, uap, retval) if (flags & MAP_FIXED) { if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) return (EINVAL); +#ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); +#endif if (addr > addr + size) return (EINVAL); } @@ -400,8 +402,10 @@ munmap(p, uap, retval) */ if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) return (EINVAL); +#ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); +#endif if (addr > addr + size) return (EINVAL); map = &p->p_vmspace->vm_map; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index d11fa8b..a6419dc 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -72,6 +72,12 @@ #include <vm/vm.h> #include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +static void _vm_object_allocate(vm_size_t, vm_object_t); +void vm_object_deactivate_pages(vm_object_t); +void vm_object_cache_trim(void); +void vm_object_remove(vm_pager_t); /* * Virtual memory objects maintain the actual data @@ -99,26 +105,56 @@ * */ + struct vm_object kernel_object_store; struct vm_object kmem_object_store; +extern int vm_cache_max; #define VM_OBJECT_HASH_COUNT 157 -int vm_cache_max = 100; /* can patch if necessary */ -struct vm_object_hash_head vm_object_hashtable[VM_OBJECT_HASH_COUNT]; +struct vm_object_hash_head vm_object_hashtable[VM_OBJECT_HASH_COUNT]; long object_collapses = 0; long object_bypasses = 0; -static void _vm_object_allocate __P((vm_size_t, vm_object_t)); +static void +_vm_object_allocate(size, object) + vm_size_t size; + register vm_object_t object; +{ + bzero(object, sizeof *object); + TAILQ_INIT(&object->memq); + vm_object_lock_init(object); + object->ref_count = 1; + object->resident_page_count = 0; + object->size = size; + object->flags = OBJ_INTERNAL; /* vm_allocate_with_pager will reset */ + object->paging_in_progress = 0; + object->copy = NULL; + + /* + * Object starts out read-write, with no pager. + */ + + object->pager = NULL; + object->paging_offset = 0; + object->shadow = NULL; + object->shadow_offset = (vm_offset_t) 0; + + simple_lock(&vm_object_list_lock); + TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); + vm_object_count++; + cnt.v_nzfod += atop(size); + simple_unlock(&vm_object_list_lock); +} /* * vm_object_init: * * Initialize the VM objects module. */ -void vm_object_init(size) - vm_size_t size; +void +vm_object_init(vm_offset_t nothing) { register int i; @@ -132,10 +168,12 @@ void vm_object_init(size) TAILQ_INIT(&vm_object_hashtable[i]); kernel_object = &kernel_object_store; - _vm_object_allocate(size, kernel_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, + kernel_object); kmem_object = &kmem_object_store; - _vm_object_allocate(VM_KMEM_SIZE + VM_MBUF_SIZE, kmem_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, + kmem_object); } /* @@ -144,55 +182,30 @@ void vm_object_init(size) * Returns a new object with the given size. 
*/
-vm_object_t vm_object_allocate(size)
+vm_object_t
+vm_object_allocate(size)
 	vm_size_t	size;
 {
 	register vm_object_t	result;
+	int s;
 
 	result = (vm_object_t)
 		malloc((u_long)sizeof *result, M_VMOBJ, M_WAITOK);
+
 	_vm_object_allocate(size, result);
 
 	return(result);
 }
 
-static void
-_vm_object_allocate(size, object)
-	vm_size_t		size;
-	register vm_object_t	object;
-{
-	TAILQ_INIT(&object->memq);
-	vm_object_lock_init(object);
-	object->ref_count = 1;
-	object->resident_page_count = 0;
-	object->size = size;
-	object->flags = OBJ_INTERNAL;	/* vm_allocate_with_pager will reset */
-	object->paging_in_progress = 0;
-	object->copy = NULL;
-
-	/*
-	 *	Object starts out read-write, with no pager.
-	 */
-
-	object->pager = NULL;
-	object->paging_offset = 0;
-	object->shadow = NULL;
-	object->shadow_offset = (vm_offset_t) 0;
-
-	simple_lock(&vm_object_list_lock);
-	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
-	vm_object_count++;
-	cnt.v_nzfod += atop(size);
-	simple_unlock(&vm_object_list_lock);
-}
 
 /*
  *	vm_object_reference:
  *
  *	Gets another reference to the given object.
  */
-void vm_object_reference(object)
+inline void
+vm_object_reference(object)
 	register vm_object_t	object;
 {
 	if (object == NULL)
@@ -214,8 +227,9 @@ void vm_object_reference(object)
  *
  *	No object may be locked.
  */
-void vm_object_deallocate(object)
-	register vm_object_t	object;
+void
+vm_object_deallocate(object)
+	vm_object_t	object;
 {
 	vm_object_t	temp;
 
@@ -235,11 +249,11 @@ void vm_object_deallocate(object)
 
 		vm_object_lock(object);
 		if (--(object->ref_count) != 0) {
+			vm_object_unlock(object);
 			/*
 			 *	If there are still references, then
 			 *	we are done.
 			 */
-			vm_object_unlock(object);
 			vm_object_cache_unlock();
 			return;
 		}
@@ -257,7 +271,12 @@ void vm_object_deallocate(object)
 			vm_object_cached++;
 			vm_object_cache_unlock();
 
-			vm_object_deactivate_pages(object);
+/*
+ * this code segment was removed because it kills performance with
+ * large, repetitively used binaries.  The functionality now resides
+ * in the pageout daemon
+ *			vm_object_deactivate_pages(object);
+ */
 			vm_object_unlock(object);
 
 			vm_object_cache_trim();
@@ -269,7 +288,7 @@ void vm_object_deallocate(object)
 		 */
 		vm_object_remove(object->pager);
 		vm_object_cache_unlock();
-
+		
 		temp = object->shadow;
 		vm_object_terminate(object);
 			/* unlocks and deallocates object */
@@ -277,18 +296,19 @@ void vm_object_deallocate(object)
 	}
 }
 
-
 /*
  * vm_object_terminate actually destroys the specified object, freeing
  * up all previously used resources.
  *
  * The object must be locked.
  */
-void vm_object_terminate(object)
+void
+vm_object_terminate(object)
 	register vm_object_t	object;
 {
 	register vm_page_t	p;
 	vm_object_t		shadow_object;
+	int s;
 
 	/*
 	 * Detach the object from its shadow if we are the shadow's
@@ -298,28 +318,68 @@ void vm_object_terminate(object)
 		vm_object_lock(shadow_object);
 		if (shadow_object->copy == object)
 			shadow_object->copy = NULL;
-#if 0
+/*
 		else if (shadow_object->copy != NULL)
 			panic("vm_object_terminate: copy/shadow inconsistency");
-#endif
+*/
 		vm_object_unlock(shadow_object);
 	}
 
 	/*
-	 * Wait until the pageout daemon is through with the object.
+	 * Wait until the pageout daemon is through
+	 * with the object.
 	 */
+
 	while (object->paging_in_progress) {
 		vm_object_sleep((int)object, object, FALSE);
 		vm_object_lock(object);
 	}
 
 	/*
-	 * If not an internal object clean all the pages, removing them
-	 * from paging queues as we go.
+	 * While the paging system is locked,
+	 * pull the object's pages off the active
+	 * and inactive queues. 
This keeps the + * pageout daemon from playing with them + * during vm_pager_deallocate. * - * XXX need to do something in the event of a cleaning error. + * We can't free the pages yet, because the + * object's pager may have to write them out + * before deallocating the paging space. + */ + + for( p = object->memq.tqh_first; p; p=p->listq.tqe_next) { + VM_PAGE_CHECK(p); + + vm_page_lock_queues(); + s = splimp(); + if (p->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, p, pageq); + p->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + + if (p->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, p, pageq); + p->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + } + splx(s); + vm_page_unlock_queues(); + } + + vm_object_unlock(object); + + if (object->paging_in_progress != 0) + panic("vm_object_deallocate: pageout in progress"); + + /* + * Clean and free the pages, as appropriate. + * All references to the object are gone, + * so we don't need to lock it. */ + if ((object->flags & OBJ_INTERNAL) == 0) { + vm_object_lock(object); (void) vm_object_page_clean(object, 0, 0, TRUE, TRUE); vm_object_unlock(object); } @@ -335,23 +395,24 @@ void vm_object_terminate(object) cnt.v_pfree++; vm_page_unlock_queues(); } - if ((object->flags & OBJ_INTERNAL) == 0) - vm_object_unlock(object); /* - * Let the pager know object is dead. + * Let the pager know object is dead. */ + if (object->pager != NULL) vm_pager_deallocate(object->pager); + simple_lock(&vm_object_list_lock); TAILQ_REMOVE(&vm_object_list, object, object_list); vm_object_count--; simple_unlock(&vm_object_list_lock); /* - * Free the space for the object. + * Free the space for the object. */ + free((caddr_t)object, M_VMOBJ); } @@ -359,6 +420,69 @@ void vm_object_terminate(object) * vm_object_page_clean * * Clean all dirty pages in the specified range of object. + * Leaves page on whatever queue it is currently on. + * + * Odd semantics: if start == end, we clean everything. + * + * The object must be locked. + */ +#if 1 +boolean_t +vm_object_page_clean(object, start, end, syncio, de_queue) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; + boolean_t syncio; + boolean_t de_queue; +{ + register vm_page_t p, nextp; + int s; + int size; + + if (object->pager == NULL) + return 1; + + if (start != end) { + start = trunc_page(start); + end = round_page(end); + } + size = end - start; + +again: + /* + * Wait until the pageout daemon is through with the object. + */ + while (object->paging_in_progress) { + vm_object_sleep((int)object, object, FALSE); + } + + nextp = object->memq.tqh_first; + while ( (p = nextp) && ((start == end) || (size != 0) ) ) { + nextp = p->listq.tqe_next; + if (start == end || (p->offset >= start && p->offset < end)) { + if (p->flags & PG_BUSY) + continue; + + size -= PAGE_SIZE; + + if ((p->flags & PG_CLEAN) + && pmap_is_modified(VM_PAGE_TO_PHYS(p))) + p->flags &= ~PG_CLEAN; + + if ((p->flags & PG_CLEAN) == 0) { + vm_pageout_clean(p,VM_PAGEOUT_FORCE); + goto again; + } + } + } + wakeup((caddr_t)object); + return 1; +} +#endif +/* + * vm_object_page_clean + * + * Clean all dirty pages in the specified range of object. * If syncio is TRUE, page cleaning is done synchronously. * If de_queue is TRUE, pages are removed from any paging queue * they were on, otherwise they are left on whatever queue they @@ -372,6 +496,7 @@ void vm_object_terminate(object) * somewhere. We attempt to clean (and dequeue) all pages regardless * of where an error occurs. 
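The new cleaning loop above, and the pmap-remove and page-remove loops later in this patch, all share one discipline: a busy page is either skipped or waited on, and any step that can sleep invalidates the list iterator, so the scan restarts from the head rather than trusting a saved next pointer. The shape of that idiom, reduced to hypothetical page_t scaffolding:

typedef struct page {
	struct page *next;
	int busy;
	int dirty;
} page_t;

/* stand-in for a pageout operation that may sleep and reshuffle the list */
static void start_write(page_t *p) { p->dirty = 0; }

static void
clean_all(page_t *head)
{
again:
	for (page_t *p = head; p != 0; p = p->next) {
		if (p->busy)
			continue;	/* owned by someone else right now */
		if (p->dirty) {
			start_write(p);	/* may block: the iterator is now stale */
			goto again;	/* so rescan from the head */
		}
	}
}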
*/ +#if 0 boolean_t vm_object_page_clean(object, start, end, syncio, de_queue) register vm_object_t object; @@ -421,6 +546,7 @@ again: * Loop through the object page list cleaning as necessary. */ for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + onqueue = 0; if ((start == end || p->offset >= start && p->offset < end) && !(p->flags & PG_FICTITIOUS)) { if ((p->flags & PG_CLEAN) && @@ -493,6 +619,7 @@ again: } return (noerror); } +#endif /* * vm_object_deactivate_pages @@ -539,6 +666,7 @@ vm_object_cache_trim() vm_object_cache_unlock(); } + /* * vm_object_pmap_copy: * @@ -576,7 +704,8 @@ void vm_object_pmap_copy(object, start, end) * * The object must *not* be locked. */ -void vm_object_pmap_remove(object, start, end) +void +vm_object_pmap_remove(object, start, end) register vm_object_t object; register vm_offset_t start; register vm_offset_t end; @@ -587,9 +716,19 @@ void vm_object_pmap_remove(object, start, end) return; vm_object_lock(object); - for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) - if ((start <= p->offset) && (p->offset < end)) +again: + for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + if ((start <= p->offset) && (p->offset < end)) { + if (p->flags & PG_BUSY) { + p->flags |= PG_WANTED; + tsleep((caddr_t) p, PVM, "vmopmr", 0); + goto again; + } pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + if ((p->flags & PG_CLEAN) == 0) + p->flags |= PG_LAUNDRY; + } + } vm_object_unlock(object); } @@ -629,6 +768,7 @@ void vm_object_copy(src_object, src_offset, size, return; } + /* * If the object's pager is null_pager or the * default pager, we don't have to make a copy @@ -637,7 +777,15 @@ void vm_object_copy(src_object, src_offset, size, */ vm_object_lock(src_object); + + /* + * Try to collapse the object before copying it. + */ + + vm_object_collapse(src_object); + if (src_object->pager == NULL || + src_object->pager->pg_type == PG_SWAP || (src_object->flags & OBJ_INTERNAL)) { /* @@ -664,10 +812,6 @@ void vm_object_copy(src_object, src_offset, size, return; } - /* - * Try to collapse the object before copying it. - */ - vm_object_collapse(src_object); /* * If the object has a pager, the pager wants to @@ -798,7 +942,8 @@ void vm_object_copy(src_object, src_offset, size, * are returned in the source parameters. */ -void vm_object_shadow(object, offset, length) +void +vm_object_shadow(object, offset, length) vm_object_t *object; /* IN/OUT */ vm_offset_t *offset; /* IN/OUT */ vm_size_t length; @@ -843,7 +988,8 @@ void vm_object_shadow(object, offset, length) * Set the specified object's pager to the specified pager. */ -void vm_object_setpager(object, pager, paging_offset, +void +vm_object_setpager(object, pager, paging_offset, read_only) vm_object_t object; vm_pager_t pager; @@ -852,9 +998,12 @@ void vm_object_setpager(object, pager, paging_offset, { #ifdef lint read_only++; /* No longer used */ -#endif +#endif lint vm_object_lock(object); /* XXX ? */ + if (object->pager && object->pager != pager) { + panic("!!!pager already allocated!!!\n"); + } object->pager = pager; object->paging_offset = paging_offset; vm_object_unlock(object); /* XXX ? 
*/ @@ -865,7 +1014,7 @@ void vm_object_setpager(object, pager, paging_offset, */ #define vm_object_hash(pager) \ - (((unsigned)pager)%VM_OBJECT_HASH_COUNT) + (((unsigned)pager >> 5)%VM_OBJECT_HASH_COUNT) /* * vm_object_lookup looks in the object cache for an object with the @@ -965,38 +1114,6 @@ vm_object_remove(pager) } } -/* - * vm_object_cache_clear removes all objects from the cache. - * - */ - -void vm_object_cache_clear() -{ - register vm_object_t object; - - /* - * Remove each object in the cache by scanning down the - * list of cached objects. - */ - vm_object_cache_lock(); - while ((object = vm_object_cached_list.tqh_first) != NULL) { - vm_object_cache_unlock(); - - /* - * Note: it is important that we use vm_object_lookup - * to gain a reference, and not vm_object_reference, because - * the logic for removing an object from the cache lies in - * lookup. - */ - if (object != vm_object_lookup(object->pager)) - panic("vm_object_cache_clear: I'm sooo confused."); - pager_cache(object, FALSE); - - vm_object_cache_lock(); - } - vm_object_cache_unlock(); -} - boolean_t vm_object_collapse_allowed = TRUE; /* * vm_object_collapse: @@ -1008,8 +1125,12 @@ boolean_t vm_object_collapse_allowed = TRUE; * Requires that the object be locked and the page * queues be unlocked. * + * This routine has significant changes by John S. Dyson + * to fix some swap memory leaks. 18 Dec 93 + * */ -void vm_object_collapse(object) +void +vm_object_collapse(object) register vm_object_t object; { @@ -1027,11 +1148,10 @@ void vm_object_collapse(object) * Verify that the conditions are right for collapse: * * The object exists and no pages in it are currently - * being paged out (or have ever been paged out). + * being paged out. */ if (object == NULL || - object->paging_in_progress != 0 || - object->pager != NULL) + object->paging_in_progress != 0) return; /* @@ -1067,12 +1187,24 @@ void vm_object_collapse(object) * parent object. */ if (backing_object->shadow != NULL && - backing_object->shadow->copy != NULL) { + backing_object->shadow->copy == backing_object) { vm_object_unlock(backing_object); return; } /* + * we can deal only with the swap pager + */ + if ((object->pager && + object->pager->pg_type != PG_SWAP) || + (backing_object->pager && + backing_object->pager->pg_type != PG_SWAP)) { + vm_object_unlock(backing_object); + return; + } + + + /* * We know that we can either collapse the backing * object (if the parent is the only reference to * it) or (perhaps) remove the parent's reference @@ -1098,7 +1230,8 @@ void vm_object_collapse(object) * pages that shadow them. */ - while ((p = backing_object->memq.tqh_first) != NULL) { + while (p = backing_object->memq.tqh_first) { + new_offset = (p->offset - backing_offset); /* @@ -1116,19 +1249,12 @@ void vm_object_collapse(object) vm_page_unlock_queues(); } else { pp = vm_page_lookup(object, new_offset); - if (pp != NULL && !(pp->flags & PG_FAKE)) { + if (pp != NULL || (object->pager && vm_pager_has_page(object->pager, + object->paging_offset + new_offset))) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); - } - else { - if (pp) { - /* may be someone waiting for it */ - PAGE_WAKEUP(pp); - vm_page_lock_queues(); - vm_page_free(pp); - vm_page_unlock_queues(); - } + } else { vm_page_rename(p, object, new_offset); } } @@ -1136,19 +1262,50 @@ void vm_object_collapse(object) /* * Move the pager from backing_object to object. - * - * XXX We're only using part of the paging space - * for keeps now... we ought to discard the - * unused portion. 
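The object-hash change earlier in this hunk shifts the pager pointer right before taking the modulus: pointers handed out by the kernel allocator are aligned, so their low-order bits carry little information and, unshifted, would crowd most pagers into a few buckets. As a stand-alone function (the 5-bit shift assumes at least 32-byte alignment):

#include <stdint.h>

#define VM_OBJECT_HASH_COUNT 157

static unsigned
object_hash(const void *pager)
{
	/* drop the nearly-constant alignment bits, then pick a bucket */
	return (unsigned)(((uintptr_t)pager >> 5) % VM_OBJECT_HASH_COUNT);
}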
*/ if (backing_object->pager) { - object->pager = backing_object->pager; - object->paging_offset = backing_offset + - backing_object->paging_offset; - backing_object->pager = NULL; + backing_object->paging_in_progress++; + if (object->pager) { + vm_pager_t bopager; + object->paging_in_progress++; + /* + * copy shadow object pages into ours + * and destroy unneeded pages in shadow object. + */ + bopager = backing_object->pager; + backing_object->pager = NULL; + vm_object_remove(backing_object->pager); + swap_pager_copy( + bopager, backing_object->paging_offset, + object->pager, object->paging_offset, + object->shadow_offset); + object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t)object); + } else { + object->paging_in_progress++; + /* + * grab the shadow objects pager + */ + object->pager = backing_object->pager; + object->paging_offset = backing_object->paging_offset + backing_offset; + vm_object_remove(backing_object->pager); + backing_object->pager = NULL; + /* + * free unnecessary blocks + */ + swap_pager_freespace(object->pager, 0, object->paging_offset); + object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t)object); + } + backing_object->paging_in_progress--; + if (backing_object->paging_in_progress == 0) + wakeup((caddr_t)backing_object); } + /* * Object now shadows whatever backing_object did. * Note that the reference to backing_object->shadow @@ -1173,7 +1330,7 @@ void vm_object_collapse(object) simple_lock(&vm_object_list_lock); TAILQ_REMOVE(&vm_object_list, backing_object, - object_list); + object_list); vm_object_count--; simple_unlock(&vm_object_list_lock); @@ -1204,9 +1361,7 @@ void vm_object_collapse(object) * of pages here. */ - for (p = backing_object->memq.tqh_first; - p != NULL; - p = p->listq.tqe_next) { + for( p = backing_object->memq.tqh_first;p;p=p->listq.tqe_next) { new_offset = (p->offset - backing_offset); /* @@ -1219,10 +1374,9 @@ void vm_object_collapse(object) */ if (p->offset >= backing_offset && - new_offset < size && - ((pp = vm_page_lookup(object, new_offset)) - == NULL || - (pp->flags & PG_FAKE))) { + new_offset <= size && + ((pp = vm_page_lookup(object, new_offset)) == NULL || (pp->flags & PG_FAKE)) && + (!object->pager || !vm_pager_has_page(object->pager, object->paging_offset+new_offset))) { /* * Page still needed. * Can't go any further. @@ -1239,23 +1393,24 @@ void vm_object_collapse(object) * count is at least 2. */ - object->shadow = backing_object->shadow; - vm_object_reference(object->shadow); + vm_object_reference(object->shadow = backing_object->shadow); object->shadow_offset += backing_object->shadow_offset; /* - * Backing object might have had a copy pointer - * to us. If it did, clear it. + * Backing object might have had a copy pointer + * to us. If it did, clear it. */ if (backing_object->copy == object) { backing_object->copy = NULL; } - + /* Drop the reference count on backing_object. * Since its ref_count was at least 2, it * will not vanish; so we don't need to call * vm_object_deallocate. */ + if (backing_object->ref_count == 1) + printf("should have called obj deallocate\n"); backing_object->ref_count--; vm_object_unlock(backing_object); @@ -1277,23 +1432,55 @@ void vm_object_collapse(object) * * The object must be locked. 
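Throughout the collapse path above, paging_in_progress acts as an interlock: it is raised around any operation that touches the backing store, and on the way back down any sleepers (vm_object_terminate, vm_object_page_clean) are woken once it reaches zero. The pattern in miniature, with obj_t and a wakeup stub as stand-ins:

typedef struct obj { int paging_in_progress; } obj_t;

static void wakeup_waiters(obj_t *o) { (void)o; /* stands in for wakeup() */ }

static void
pip_hold(obj_t *o)
{
	o->paging_in_progress++;	/* keep the object from being torn down */
}

static void
pip_release(obj_t *o)
{
	if (--o->paging_in_progress == 0)
		wakeup_waiters(o);	/* terminate/clean may now proceed */
}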
*/ -void vm_object_page_remove(object, start, end) +void +vm_object_page_remove(object, start, end) register vm_object_t object; register vm_offset_t start; register vm_offset_t end; { register vm_page_t p, next; + vm_offset_t size; + int cnt; + int s; if (object == NULL) return; - for (p = object->memq.tqh_first; p != NULL; p = next) { - next = p->listq.tqe_next; - if ((start <= p->offset) && (p->offset < end)) { - pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); - vm_page_lock_queues(); - vm_page_free(p); - vm_page_unlock_queues(); + start = trunc_page(start); + end = round_page(end); +again: + size = end-start; + if (size > 4*PAGE_SIZE || size >= object->size/4) { + for (p = object->memq.tqh_first; (p != NULL && size > 0); p = next) { + next = p->listq.tqe_next; + if ((start <= p->offset) && (p->offset < end)) { + if (p->flags & PG_BUSY) { + p->flags |= PG_WANTED; + tsleep((caddr_t) p, PVM, "vmopar", 0); + goto again; + } + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + size -= PAGE_SIZE; + } + } + } else { + while (size > 0) { + while (p = vm_page_lookup(object, start)) { + if (p->flags & PG_BUSY) { + p->flags |= PG_WANTED; + tsleep((caddr_t) p, PVM, "vmopar", 0); + goto again; + } + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } + start += PAGE_SIZE; + size -= PAGE_SIZE; } } } @@ -1389,6 +1576,27 @@ boolean_t vm_object_coalesce(prev_object, next_object, } /* + * returns page after looking up in shadow chain + */ + +vm_page_t +vm_object_page_lookup(object, offset) + vm_object_t object; + vm_offset_t offset; +{ + vm_page_t m; + if (!(m=vm_page_lookup(object, offset))) { + if (!object->shadow) + return 0; + else + return vm_object_page_lookup(object->shadow, offset + object->shadow_offset); + } + return m; +} + +#define DEBUG +#if defined(DEBUG) || (NDDB > 0) +/* * vm_object_print: [ debug ] */ void vm_object_print(object, full) @@ -1434,3 +1642,4 @@ void vm_object_print(object, full) printf("\n"); indent -= 2; } +#endif /* defined(DEBUG) || (NDDB > 0) */ diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 0cd9d87..38d320f 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1,6 +1,6 @@ /* - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. @@ -33,9 +33,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vm_page.c 8.3 (Berkeley) 3/21/94 - * - * + * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 + * $Id: vm_page.c,v 1.17 1994/04/20 07:07:14 davidg Exp $ + */ + +/* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * @@ -68,6 +70,7 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/proc.h> #include <vm/vm.h> #include <vm/vm_page.h> @@ -123,7 +126,6 @@ void vm_set_page_size() break; } - /* * vm_page_startup: * @@ -133,17 +135,55 @@ void vm_set_page_size() * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. 
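vm_object_page_lookup, added above, recurses down the shadow chain, sliding the offset by shadow_offset at each level until a resident page turns up or the chain ends. The same walk written iteratively, with stub scaffolding in place of the real hash lookup:

typedef struct page page_t;

typedef struct obj {
	struct obj *shadow;
	unsigned long shadow_offset;
} obj_t;

/* stub standing in for the hash-table lookup used by the real code */
static page_t *
page_lookup(obj_t *o, unsigned long offset)
{
	(void)o; (void)offset;
	return 0;
}

static page_t *
chain_lookup(obj_t *o, unsigned long offset)
{
	page_t *m;

	for (; o != 0; o = o->shadow) {
		if ((m = page_lookup(o, offset)) != 0)
			return m;		/* resident at this level */
		offset += o->shadow_offset;	/* translate into the backing object */
	}
	return 0;				/* not resident anywhere in the chain */
}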
*/ -void vm_page_startup(start, end) - vm_offset_t *start; - vm_offset_t *end; + +vm_offset_t +vm_page_startup(starta, enda, vaddr) + register vm_offset_t starta; + vm_offset_t enda; + register vm_offset_t vaddr; { + register vm_offset_t mapped; register vm_page_t m; - register struct pglist *bucket; - vm_size_t npages; + register struct pglist *bucket; + vm_size_t npages, page_range; + register vm_offset_t new_start; int i; vm_offset_t pa; + int nblocks; + vm_offset_t first_managed_page; + int size; + extern vm_offset_t kentry_data; extern vm_size_t kentry_data_size; + extern vm_offset_t phys_avail[]; +/* the biggest memory array is the second group of pages */ + vm_offset_t start; + vm_offset_t biggestone, biggestsize; + + vm_offset_t total; + + total = 0; + biggestsize = 0; + biggestone = 0; + nblocks = 0; + vaddr = round_page(vaddr); + + for (i = 0; phys_avail[i + 1]; i += 2) { + phys_avail[i] = round_page(phys_avail[i]); + phys_avail[i+1] = trunc_page(phys_avail[i+1]); + } + + for (i = 0; phys_avail[i + 1]; i += 2) { + int size = phys_avail[i+1] - phys_avail[i]; + if (size > biggestsize) { + biggestone = i; + biggestsize = size; + } + ++nblocks; + total += size; + } + + start = phys_avail[biggestone]; /* @@ -163,7 +203,7 @@ void vm_page_startup(start, end) TAILQ_INIT(&vm_page_queue_inactive); /* - * Calculate the number of hash table buckets. + * Allocate (and initialize) the hash table buckets. * * The number of buckets MUST BE a power of 2, and * the actual value is the next power of 2 greater @@ -172,23 +212,31 @@ void vm_page_startup(start, end) * Note: * This computation can be tweaked if desired. */ - + vm_page_buckets = (struct pglist *)vaddr; + bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { vm_page_bucket_count = 1; - while (vm_page_bucket_count < atop(*end - *start)) + while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } + vm_page_hash_mask = vm_page_bucket_count - 1; /* - * Allocate (and initialize) the hash table buckets. + * Validate these addresses. */ - vm_page_buckets = (struct pglist *) - pmap_bootstrap_alloc(vm_page_bucket_count * sizeof(struct pglist)); - bucket = vm_page_buckets; - for (i = vm_page_bucket_count; i--;) { + new_start = start + vm_page_bucket_count * sizeof(struct pglist); + new_start = round_page(new_start); + mapped = vaddr; + vaddr = pmap_map(mapped, start, new_start, + VM_PROT_READ|VM_PROT_WRITE); + start = new_start; + bzero((caddr_t) mapped, vaddr - mapped); + mapped = vaddr; + + for (i = 0; i< vm_page_bucket_count; i++) { TAILQ_INIT(bucket); bucket++; } @@ -196,11 +244,9 @@ void vm_page_startup(start, end) simple_lock_init(&bucket_lock); /* - * Truncate the remainder of physical memory to our page size. + * round (or truncate) the addresses to our page size. */ - *end = trunc_page(*end); - /* * Pre-allocate maps and map entries that cannot be dynamically * allocated via malloc(). The maps include the kernel_map and @@ -213,9 +259,20 @@ void vm_page_startup(start, end) * map (they should use their own maps). */ - kentry_data_size = round_page(MAX_KMAP*sizeof(struct vm_map) + - MAX_KMAPENT*sizeof(struct vm_map_entry)); - kentry_data = (vm_offset_t) pmap_bootstrap_alloc(kentry_data_size); + kentry_data_size = MAX_KMAP * sizeof(struct vm_map) + + MAX_KMAPENT * sizeof(struct vm_map_entry); + kentry_data_size = round_page(kentry_data_size); + kentry_data = (vm_offset_t) vaddr; + vaddr += kentry_data_size; + + /* + * Validate these zone addresses. 
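vm_page_startup now carves its bootstrap structures out of the largest chunk of physical memory, found by page-aligning each phys_avail range and scanning for the biggest one. That selection loop, runnable in isolation with an illustrative table (the values are made up):

#include <stdio.h>

#define PAGE_SIZE 4096
#define round_page(x) (((x) + PAGE_SIZE - 1) & ~(unsigned long)(PAGE_SIZE - 1))
#define trunc_page(x) ((x) & ~(unsigned long)(PAGE_SIZE - 1))

/* (start, end) pairs, terminated by a zero end, as in the kernel table */
static unsigned long phys_avail[] = {
	0x00001000, 0x0009f000,
	0x00100000, 0x00ff0000,
	0, 0
};

int
main(void)
{
	unsigned long total = 0, biggestsize = 0;
	int i, biggestone = 0, nblocks = 0;

	for (i = 0; phys_avail[i + 1]; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}
	for (i = 0; phys_avail[i + 1]; i += 2) {
		unsigned long size = phys_avail[i + 1] - phys_avail[i];
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		++nblocks;
		total += size;
	}
	printf("%d blocks, %lu bytes, biggest at index %d\n",
	    nblocks, total, biggestone);
	return 0;
}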
+ */ + + new_start = start + (vaddr - mapped); + pmap_map(mapped, start, new_start, VM_PROT_READ|VM_PROT_WRITE); + bzero((caddr_t) mapped, (vaddr - mapped)); + start = round_page(new_start); /* * Compute the number of pages of memory that will be @@ -223,53 +280,53 @@ void vm_page_startup(start, end) * of a page structure per page). */ - cnt.v_free_count = npages = (*end - *start + sizeof(struct vm_page)) - / (PAGE_SIZE + sizeof(struct vm_page)); + npages = (total - (start - phys_avail[biggestone])) / (PAGE_SIZE + sizeof(struct vm_page)); + first_page = phys_avail[0] / PAGE_SIZE; + page_range = (phys_avail[(nblocks-1)*2 + 1] - phys_avail[0]) / PAGE_SIZE; /* - * Record the extent of physical memory that the - * virtual memory system manages. + * Initialize the mem entry structures now, and + * put them in the free queue. */ - first_page = *start; - first_page += npages*sizeof(struct vm_page); - first_page = atop(round_page(first_page)); - last_page = first_page + npages - 1; - - first_phys_addr = ptoa(first_page); - last_phys_addr = ptoa(last_page) + PAGE_MASK; + vm_page_array = (vm_page_t) vaddr; + mapped = vaddr; /* - * Allocate and clear the mem entry structures. + * Validate these addresses. */ - m = vm_page_array = (vm_page_t) - pmap_bootstrap_alloc(npages * sizeof(struct vm_page)); + new_start = round_page(start + page_range * sizeof (struct vm_page)); + mapped = pmap_map(mapped, start, new_start, + VM_PROT_READ|VM_PROT_WRITE); + start = new_start; + + first_managed_page = start / PAGE_SIZE; /* - * Initialize the mem entry structures now, and - * put them in the free queue. + * Clear all of the page structures */ + bzero((caddr_t)vm_page_array, page_range * sizeof(struct vm_page)); - pa = first_phys_addr; - while (npages--) { - m->flags = 0; - m->object = NULL; - m->phys_addr = pa; -#ifdef i386 - if (pmap_isvalidphys(m->phys_addr)) { + cnt.v_page_count = 0; + cnt.v_free_count= 0; + for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { + if (i == biggestone) + pa = ptoa(first_managed_page); + else + pa = phys_avail[i]; + while (pa < phys_avail[i + 1] && npages-- > 0) { + ++cnt.v_page_count; + ++cnt.v_free_count; + m = PHYS_TO_VM_PAGE(pa); + m->flags = 0; + m->object = 0; + m->phys_addr = pa; + m->hold_count = 0; TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); - } else { - /* perhaps iomem needs it's own type, or dev pager? */ - m->flags |= PG_FICTITIOUS | PG_BUSY; - cnt.v_free_count--; + pa += PAGE_SIZE; } -#else /* i386 */ - TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); -#endif /* i386 */ - m++; - pa += PAGE_SIZE; } /* @@ -278,8 +335,7 @@ void vm_page_startup(start, end) */ simple_lock_init(&vm_pages_needed_lock); - /* from now on, pmap_bootstrap_alloc can't be used */ - vm_page_startup_initialized = TRUE; + return(mapped); } /* @@ -289,8 +345,13 @@ void vm_page_startup(start, end) * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. 
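The note above is the whole point of the sizing loop: once the bucket count is the next power of two at or above the page count, hashing can use a mask instead of a modulus. Both halves together, in reduced form:

static unsigned bucket_count, hash_mask;

static void
size_buckets(unsigned npages)
{
	bucket_count = 1;
	while (bucket_count < npages)	/* next power of two >= npages */
		bucket_count <<= 1;
	hash_mask = bucket_count - 1;	/* all ones below the top bit */
}

static unsigned
page_hash(unsigned long object, unsigned long offset, unsigned long page_size)
{
	/* "& hash_mask" equals "% bucket_count" only for powers of two */
	return (unsigned)(object + offset / page_size) & hash_mask;
}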
*/ -#define vm_page_hash(object, offset) \ - (((unsigned)object+(unsigned)atop(offset))&vm_page_hash_mask) +inline const int +vm_page_hash(object, offset) + vm_object_t object; + vm_offset_t offset; +{ + return ((unsigned)object + offset/NBPG) & vm_page_hash_mask; +} /* * vm_page_insert: [ internal use only ] @@ -307,7 +368,7 @@ void vm_page_insert(mem, object, offset) register vm_offset_t offset; { register struct pglist *bucket; - int spl; + int s; VM_PAGE_CHECK(mem); @@ -326,11 +387,11 @@ void vm_page_insert(mem, object, offset) */ bucket = &vm_page_buckets[vm_page_hash(object, offset)]; - spl = splimp(); + s = splimp(); simple_lock(&bucket_lock); TAILQ_INSERT_TAIL(bucket, mem, hashq); simple_unlock(&bucket_lock); - (void) splx(spl); + (void) splx(s); /* * Now link into the object's list of backed pages. @@ -361,7 +422,7 @@ void vm_page_remove(mem) register vm_page_t mem; { register struct pglist *bucket; - int spl; + int s; VM_PAGE_CHECK(mem); @@ -373,11 +434,11 @@ void vm_page_remove(mem) */ bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)]; - spl = splimp(); + s = splimp(); simple_lock(&bucket_lock); TAILQ_REMOVE(bucket, mem, hashq); simple_unlock(&bucket_lock); - (void) splx(spl); + (void) splx(s); /* * Now remove from the object's list of backed pages. @@ -410,7 +471,7 @@ vm_page_t vm_page_lookup(object, offset) { register vm_page_t mem; register struct pglist *bucket; - int spl; + int s; /* * Search the hash table for this object/offset pair @@ -418,19 +479,19 @@ vm_page_t vm_page_lookup(object, offset) bucket = &vm_page_buckets[vm_page_hash(object, offset)]; - spl = splimp(); + s = splimp(); simple_lock(&bucket_lock); for (mem = bucket->tqh_first; mem != NULL; mem = mem->hashq.tqe_next) { VM_PAGE_CHECK(mem); if ((mem->object == object) && (mem->offset == offset)) { simple_unlock(&bucket_lock); - splx(spl); + splx(s); return(mem); } } simple_unlock(&bucket_lock); - splx(spl); + splx(s); return(NULL); } @@ -465,46 +526,62 @@ void vm_page_rename(mem, new_object, new_offset) * * Object must be locked. */ -vm_page_t vm_page_alloc(object, offset) +vm_page_t +vm_page_alloc(object, offset) vm_object_t object; vm_offset_t offset; { register vm_page_t mem; - int spl; + int s; - spl = splimp(); /* XXX */ + s = splimp(); simple_lock(&vm_page_queue_free_lock); - if (vm_page_queue_free.tqh_first == NULL) { + if ( object != kernel_object && + object != kmem_object && + curproc != pageproc && curproc != &proc0 && + cnt.v_free_count < cnt.v_free_reserved) { + simple_unlock(&vm_page_queue_free_lock); - splx(spl); + splx(s); + /* + * this wakeup seems unnecessary, but there is code that + * might just check to see if there are free pages, and + * punt if there aren't. VM_WAIT does this too, but + * redundant wakeups aren't that bad... + */ + if (curproc != pageproc) + wakeup((caddr_t) &vm_pages_needed); + return(NULL); + } + if (( mem = vm_page_queue_free.tqh_first) == 0) { + simple_unlock(&vm_page_queue_free_lock); + printf("No pages???\n"); + splx(s); + /* + * comment above re: wakeups applies here too... + */ + if (curproc != pageproc) + wakeup((caddr_t) &vm_pages_needed); return(NULL); } - mem = vm_page_queue_free.tqh_first; TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); cnt.v_free_count--; simple_unlock(&vm_page_queue_free_lock); - splx(spl); VM_PAGE_INIT(mem, object, offset); + splx(s); - /* - * Decide if we should poke the pageout daemon. 
- * We do this if the free count is less than the low - * water mark, or if the free count is less than the high - * water mark (but above the low water mark) and the inactive - * count is less than its target. - * - * We don't have the counts locked ... if they change a little, - * it doesn't really matter. - */ +/* + * don't wakeup too often, so we wakeup the pageout daemon when + * we would be nearly out of memory. + */ + if (curproc != pageproc && + (cnt.v_free_count < cnt.v_free_reserved)) + wakeup((caddr_t) &vm_pages_needed); - if (cnt.v_free_count < cnt.v_free_min || - (cnt.v_free_count < cnt.v_free_target && - cnt.v_inactive_count < cnt.v_inactive_target)) - thread_wakeup((int)&vm_pages_needed); - return (mem); + return(mem); } /* @@ -518,6 +595,8 @@ vm_page_t vm_page_alloc(object, offset) void vm_page_free(mem) register vm_page_t mem; { + int s; + s = splimp(); vm_page_remove(mem); if (mem->flags & PG_ACTIVE) { TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); @@ -532,18 +611,46 @@ void vm_page_free(mem) } if (!(mem->flags & PG_FICTITIOUS)) { - int spl; - spl = splimp(); simple_lock(&vm_page_queue_free_lock); + if (mem->wire_count) { + cnt.v_wire_count--; + mem->wire_count = 0; + } TAILQ_INSERT_TAIL(&vm_page_queue_free, mem, pageq); cnt.v_free_count++; simple_unlock(&vm_page_queue_free_lock); - splx(spl); + splx(s); + /* + * if pageout daemon needs pages, then tell it that there + * are some free. + */ + if (vm_pageout_pages_needed) + wakeup((caddr_t)&vm_pageout_pages_needed); + + /* + * wakeup processes that are waiting on memory if we + * hit a high water mark. + */ + if (cnt.v_free_count == cnt.v_free_min) { + wakeup((caddr_t)&cnt.v_free_count); + } + + /* + * wakeup scheduler process if we have lots of memory. + * this process will swapin processes. + */ + if (cnt.v_free_count == cnt.v_free_target) { + wakeup((caddr_t)&proc0); + } + } else { + splx(s); } + wakeup((caddr_t) mem); } + /* * vm_page_wire: * @@ -556,9 +663,11 @@ void vm_page_free(mem) void vm_page_wire(mem) register vm_page_t mem; { + int s; VM_PAGE_CHECK(mem); if (mem->wire_count == 0) { + s = splimp(); if (mem->flags & PG_ACTIVE) { TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); cnt.v_active_count--; @@ -569,6 +678,7 @@ void vm_page_wire(mem) cnt.v_inactive_count--; mem->flags &= ~PG_INACTIVE; } + splx(s); cnt.v_wire_count++; } mem->wire_count++; @@ -585,17 +695,77 @@ void vm_page_wire(mem) void vm_page_unwire(mem) register vm_page_t mem; { + int s; VM_PAGE_CHECK(mem); - mem->wire_count--; + s = splimp(); + + if( mem->wire_count) + mem->wire_count--; if (mem->wire_count == 0) { TAILQ_INSERT_TAIL(&vm_page_queue_active, mem, pageq); cnt.v_active_count++; mem->flags |= PG_ACTIVE; cnt.v_wire_count--; } + splx(s); } +#if 0 +/* + * vm_page_deactivate: + * + * Returns the given page to the inactive list, + * indicating that no physical maps have access + * to this page. [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void +vm_page_deactivate(m) + register vm_page_t m; +{ + int spl; + VM_PAGE_CHECK(m); + + /* + * Only move active pages -- ignore locked or already + * inactive ones. + * + * XXX: sometimes we get pages which aren't wired down + * or on any queue - we need to put them on the inactive + * queue also, otherwise we lose track of them. + * Paul Mackerras (paulus@cs.anu.edu.au) 9-Jan-93. 
+ */ + + spl = splimp(); + if (!(m->flags & PG_INACTIVE) && m->wire_count == 0 && + m->hold_count == 0) { + + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + if (m->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + m->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + m->flags |= PG_INACTIVE; + cnt.v_inactive_count++; +#define NOT_DEACTIVATE_PROTECTS +#ifndef NOT_DEACTIVATE_PROTECTS + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); +#else + if ((m->flags & PG_CLEAN) && + pmap_is_modified(VM_PAGE_TO_PHYS(m))) + m->flags &= ~PG_CLEAN; +#endif + if ((m->flags & PG_CLEAN) == 0) + m->flags |= PG_LAUNDRY; + } + splx(spl); +} +#endif +#if 1 /* * vm_page_deactivate: * @@ -608,14 +778,16 @@ void vm_page_unwire(mem) void vm_page_deactivate(m) register vm_page_t m; { + int s; VM_PAGE_CHECK(m); + s = splimp(); /* * Only move active pages -- ignore locked or already * inactive ones. */ - if (m->flags & PG_ACTIVE) { + if ((m->flags & PG_ACTIVE) && (m->hold_count == 0)) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); @@ -623,15 +795,21 @@ void vm_page_deactivate(m) m->flags |= PG_INACTIVE; cnt.v_active_count--; cnt.v_inactive_count++; +#define NOT_DEACTIVATE_PROTECTS +#ifndef NOT_DEACTIVATE_PROTECTS + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); +#else if (pmap_is_modified(VM_PAGE_TO_PHYS(m))) m->flags &= ~PG_CLEAN; +#endif if (m->flags & PG_CLEAN) m->flags &= ~PG_LAUNDRY; else m->flags |= PG_LAUNDRY; } + splx(s); } - +#endif /* * vm_page_activate: * @@ -643,8 +821,10 @@ void vm_page_deactivate(m) void vm_page_activate(m) register vm_page_t m; { + int s; VM_PAGE_CHECK(m); + s = splimp(); if (m->flags & PG_INACTIVE) { TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); cnt.v_inactive_count--; @@ -656,8 +836,12 @@ void vm_page_activate(m) TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); m->flags |= PG_ACTIVE; + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + m->act_count = 10; cnt.v_active_count++; } + splx(s); } /* @@ -668,12 +852,12 @@ void vm_page_activate(m) * be used by the zero-fill object. 
*/ -boolean_t vm_page_zero_fill(m) +boolean_t +vm_page_zero_fill(m) vm_page_t m; { VM_PAGE_CHECK(m); - m->flags &= ~PG_CLEAN; pmap_zero_page(VM_PAGE_TO_PHYS(m)); return(TRUE); } @@ -683,14 +867,13 @@ boolean_t vm_page_zero_fill(m) * * Copy one page to another */ - -void vm_page_copy(src_m, dest_m) +void +vm_page_copy(src_m, dest_m) vm_page_t src_m; vm_page_t dest_m; { VM_PAGE_CHECK(src_m); VM_PAGE_CHECK(dest_m); - dest_m->flags &= ~PG_CLEAN; pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m)); } diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 8bf5146..e8049c4 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -107,6 +107,8 @@ struct vm_page { u_short wire_count; /* wired down maps refs (P) */ u_short flags; /* see below */ + short hold_count; /* page hold count */ + u_short act_count; /* page usage count */ vm_offset_t phys_addr; /* physical address of page */ }; @@ -209,7 +211,7 @@ simple_lock_data_t vm_page_queue_free_lock; (m)->flags &= ~PG_BUSY; \ if ((m)->flags & PG_WANTED) { \ (m)->flags &= ~PG_WANTED; \ - thread_wakeup((int) (m)); \ + wakeup((caddr_t) (m)); \ } \ } @@ -222,6 +224,8 @@ simple_lock_data_t vm_page_queue_free_lock; (mem)->flags = PG_BUSY | PG_CLEAN | PG_FAKE; \ vm_page_insert((mem), (object), (offset)); \ (mem)->wire_count = 0; \ + (mem)->hold_count = 0; \ + (mem)->act_count = 0; \ } void vm_page_activate __P((vm_page_t)); @@ -233,10 +237,32 @@ void vm_page_insert __P((vm_page_t, vm_object_t, vm_offset_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_offset_t)); void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_offset_t)); -void vm_page_startup __P((vm_offset_t *, vm_offset_t *)); +vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t)); void vm_page_wire __P((vm_page_t)); boolean_t vm_page_zero_fill __P((vm_page_t)); + +/* + * Keep page from being freed by the page daemon + * much of the same effect as wiring, except much lower + * overhead and should be used only for *very* temporary + * holding ("wiring"). + */ +static inline void +vm_page_hold(mem) + vm_page_t mem; +{ + mem->hold_count++; +} + +static inline void +vm_page_unhold(mem) + vm_page_t mem; +{ + if( --mem->hold_count < 0) + panic("vm_page_unhold: hold count < 0!!!"); +} + #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 6795405..202bf03 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1,6 +1,10 @@ /* - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. @@ -33,7 +37,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94 + * @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. @@ -60,6 +64,8 @@ * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
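The vm_page_hold/vm_page_unhold pair added to vm_page.h above is the lightweight cousin of wiring: it is only a counter bump, with no queue surgery and no wire accounting, which is why the comment restricts it to very temporary use. Typical use looks like the following sketch, where copy_window is a hypothetical short, non-sleeping access:

	vm_page_hold(m);	/* pageout daemon must now leave m alone */
	copy_window(m);		/* brief access; must not sleep for long */
	vm_page_unhold(m);	/* page becomes eligible for pageout again */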
+ * + * $Id: vm_pageout.c,v 1.20 1994/04/20 07:07:15 davidg Exp $ */ /* @@ -67,501 +73,802 @@ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> #include <vm/vm.h> #include <vm/vm_page.h> #include <vm/vm_pageout.h> -#ifndef VM_PAGE_FREE_MIN -#define VM_PAGE_FREE_MIN (cnt.v_free_count / 20) -#endif +extern vm_map_t kmem_map; +int vm_pages_needed; /* Event on which pageout daemon sleeps */ +int vm_pagescanner; /* Event on which pagescanner sleeps */ +int vm_pageout_free_min = 0; /* Stop pageout to wait for pagers at this free level */ + +int vm_pageout_pages_needed = 0; /* flag saying that the pageout daemon needs pages */ +int vm_page_pagesfreed; -#ifndef VM_PAGE_FREE_TARGET -#define VM_PAGE_FREE_TARGET ((cnt.v_free_min * 4) / 3) -#endif +extern int npendingio; +extern int hz; +int vm_pageout_proc_limit; +extern int nswiodone; +extern int swap_pager_full; +extern int swap_pager_ready(); -int vm_page_free_min_min = 16 * 1024; -int vm_page_free_min_max = 256 * 1024; +#define MAXREF 32767 -int vm_pages_needed; /* Event on which pageout daemon sleeps */ +#define MAXSCAN 512 /* maximum number of pages to scan in active queue */ + /* set the "clock" hands to be (MAXSCAN * 4096) Bytes */ +#define ACT_DECLINE 1 +#define ACT_ADVANCE 6 +#define ACT_MAX 300 + +#define LOWATER ((2048*1024)/NBPG) + +#define VM_PAGEOUT_PAGE_COUNT 8 +static vm_offset_t vm_space_needed; +int vm_pageout_req_do_stats; int vm_page_max_wired = 0; /* XXX max # of wired pages system-wide */ -#ifdef CLUSTERED_PAGEOUT -#define MAXPOCLUSTER (MAXPHYS/NBPG) /* XXX */ -int doclustered_pageout = 1; -#endif /* - * vm_pageout_scan does the dirty work for the pageout daemon. + * vm_pageout_clean: + * cleans a vm_page */ -void -vm_pageout_scan() +int +vm_pageout_clean(m, sync) + register vm_page_t m; + int sync; { - register vm_page_t m, next; - register int page_shortage; - register int s; - register int pages_freed; - int free; - vm_object_t object; + /* + * Clean the page and remove it from the + * laundry. + * + * We set the busy bit to cause + * potential page faults on this page to + * block. + * + * And we set pageout-in-progress to keep + * the object from disappearing during + * pageout. This guarantees that the + * page won't move from the inactive + * queue. (However, any other page on + * the inactive queue may move!) + */ + + register vm_object_t object; + register vm_pager_t pager; + int pageout_status[VM_PAGEOUT_PAGE_COUNT]; + vm_page_t ms[VM_PAGEOUT_PAGE_COUNT]; + int pageout_count; + int anyok=0; + int i; + vm_offset_t offset = m->offset; + + object = m->object; + if (!object) { + printf("pager: object missing\n"); + return 0; + } /* - * Only continue when we want more pages to be "free" + * Try to collapse the object before + * making a pager for it. We must + * unlock the page queues first. + * We try to defer the creation of a pager + * until all shadows are not paging. This + * allows vm_object_collapse to work better and + * helps control swap space size. + * (J. 
Dyson 11 Nov 93) */ - cnt.v_rev++; + if (!object->pager && + cnt.v_free_count < vm_pageout_free_min) + return 0; - s = splimp(); - simple_lock(&vm_page_queue_free_lock); - free = cnt.v_free_count; - simple_unlock(&vm_page_queue_free_lock); - splx(s); + if (!object->pager && + object->shadow && + object->shadow->paging_in_progress) + return 0; - if (free < cnt.v_free_target) { - swapout_threads(); + if( !sync) { + if (object->shadow) { + vm_object_collapse(object); + if (!vm_page_lookup(object, offset)) + return 0; + } - /* - * Be sure the pmap system is updated so - * we can scan the inactive queue. - */ + if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { + return 0; + } + } + + pageout_count = 1; + ms[0] = m; + + if( pager = object->pager) { + for(i=1;i<VM_PAGEOUT_PAGE_COUNT;i++) { + if( ms[i] = vm_page_lookup( object, offset+i*NBPG)) { + if((((ms[i]->flags & (PG_CLEAN|PG_INACTIVE|PG_BUSY)) == PG_INACTIVE) + || (( ms[i]->flags & PG_CLEAN) == 0 && sync == VM_PAGEOUT_FORCE)) + && (ms[i]->wire_count == 0) + && (ms[i]->hold_count == 0)) + pageout_count++; + else + break; + } else + break; + } + for(i=0;i<pageout_count;i++) { + ms[i]->flags |= PG_BUSY; + pmap_page_protect(VM_PAGE_TO_PHYS(ms[i]), VM_PROT_READ); + } + object->paging_in_progress += pageout_count; + cnt.v_pageouts += pageout_count; + } else { + + m->flags |= PG_BUSY; - pmap_update(); + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ); + + cnt.v_pageouts++; + + object->paging_in_progress++; + + pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, + object->size, VM_PROT_ALL, 0); + if (pager != NULL) { + vm_object_setpager(object, pager, 0, FALSE); + } } /* - * Acquire the resident page system lock, - * as we may be changing what's resident quite a bit. + * If there is no pager for the page, + * use the default pager. If there's + * no place to put the page at the + * moment, leave it in the laundry and + * hope that there will be paging space + * later. */ - vm_page_lock_queues(); - /* - * Start scanning the inactive queue for pages we can free. - * We keep scanning until we have enough free pages or - * we have scanned through the entire queue. If we - * encounter dirty pages, we start cleaning them. - */ + if ((pager && pager->pg_type == PG_SWAP) || + cnt.v_free_count >= vm_pageout_free_min) { + if( pageout_count == 1) { + pageout_status[0] = pager ? + vm_pager_put(pager, m, + ((sync || (object == kernel_object)) ? TRUE: FALSE)) : + VM_PAGER_FAIL; + } else { + if( !pager) { + for(i=0;i<pageout_count;i++) + pageout_status[i] = VM_PAGER_FAIL; + } else { + vm_pager_put_pages(pager, ms, pageout_count, + ((sync || (object == kernel_object)) ? TRUE : FALSE), + pageout_status); + } + } + + } else { + for(i=0;i<pageout_count;i++) + pageout_status[i] = VM_PAGER_FAIL; + } - pages_freed = 0; - for (m = vm_page_queue_inactive.tqh_first; m != NULL; m = next) { - s = splimp(); - simple_lock(&vm_page_queue_free_lock); - free = cnt.v_free_count; - simple_unlock(&vm_page_queue_free_lock); - splx(s); - if (free >= cnt.v_free_target) + for(i=0;i<pageout_count;i++) { + switch (pageout_status[i]) { + case VM_PAGER_OK: + ms[i]->flags &= ~PG_LAUNDRY; + ++anyok; + break; + case VM_PAGER_PEND: + ms[i]->flags &= ~PG_LAUNDRY; + ++anyok; + break; + case VM_PAGER_BAD: + /* + * Page outside of range of object. + * Right now we essentially lose the + * changes by pretending it worked. 
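The gathering loop above grows the cluster one page at a time from the target page's offset and stops at the first page that is absent or ineligible, so the pager always receives a single contiguous run. Its shape, with hypothetical lookup and eligibility stubs in place of vm_page_lookup and the flag checks:

#define CLUSTER_MAX 8	/* mirrors VM_PAGEOUT_PAGE_COUNT */
#define PGSIZE 4096

typedef struct page page_t;

/* stubs standing in for vm_page_lookup and the dirty/inactive tests */
static page_t *lookup(void *o, unsigned long off) { (void)o; (void)off; return 0; }
static int eligible(page_t *p) { (void)p; return 0; }

static int
gather_cluster(void *object, unsigned long offset, page_t *m, page_t **ms)
{
	int count = 1;

	ms[0] = m;			/* the target page is always included */
	for (int i = 1; i < CLUSTER_MAX; i++) {
		page_t *p = lookup(object, offset + i * PGSIZE);
		if (p == 0 || !eligible(p))
			break;		/* the first gap ends the run */
		ms[count++] = p;
	}
	return count;
}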
+ */ + ms[i]->flags &= ~PG_LAUNDRY; + ms[i]->flags |= PG_CLEAN; + pmap_clear_modify(VM_PAGE_TO_PHYS(ms[i])); + break; + case VM_PAGER_ERROR: + case VM_PAGER_FAIL: + /* + * If page couldn't be paged out, then + * reactivate the page so it doesn't + * clog the inactive list. (We will + * try paging out it again later). + */ + if (ms[i]->flags & PG_INACTIVE) + vm_page_activate(ms[i]); + break; + case VM_PAGER_AGAIN: break; + } - cnt.v_scan++; - next = m->pageq.tqe_next; /* - * If the page has been referenced, move it back to the - * active queue. + * If the operation is still going, leave + * the page busy to block all other accesses. + * Also, leave the paging in progress + * indicator set so that we don't attempt an + * object collapse. */ - if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { - vm_page_activate(m); - cnt.v_reactivated++; - continue; + if (pageout_status[i] != VM_PAGER_PEND) { + PAGE_WAKEUP(ms[i]); + if (--object->paging_in_progress == 0) + wakeup((caddr_t) object); + if (pmap_is_referenced(VM_PAGE_TO_PHYS(ms[i]))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(ms[i])); + if( ms[i]->flags & PG_INACTIVE) + vm_page_activate(ms[i]); + } } + } + return anyok; +} +/* + * vm_pageout_object_deactivate_pages + * + * deactivate enough pages to satisfy the inactive target + * requirements or if vm_page_proc_limit is set, then + * deactivate all of the pages in the object and its + * shadows. + * + * The object and map must be locked. + */ +int +vm_pageout_object_deactivate_pages(map, object, count) + vm_map_t map; + vm_object_t object; + int count; +{ + register vm_page_t p, next; + int rcount; + int s; + int dcount; + + dcount = 0; + if (count == 0) + count = 1; + + if (object->shadow) { + int scount = count; + if( object->shadow->ref_count > 1) + scount /= object->shadow->ref_count; + if( scount) + dcount += vm_pageout_object_deactivate_pages(map, object->shadow, scount); + } + + if (object->paging_in_progress) + return dcount; + + /* + * scan the objects entire memory queue + */ + rcount = object->resident_page_count; + p = object->memq.tqh_first; + while (p && (rcount-- > 0)) { + next = p->listq.tqe_next; + vm_page_lock_queues(); /* - * If the page is clean, free it up. + * if a page is active, not wired and is in the processes pmap, + * then deactivate the page. */ - if (m->flags & PG_CLEAN) { - object = m->object; - if (vm_object_lock_try(object)) { - pmap_page_protect(VM_PAGE_TO_PHYS(m), - VM_PROT_NONE); - vm_page_free(m); - pages_freed++; - cnt.v_dfree++; - vm_object_unlock(object); + if ((p->flags & (PG_ACTIVE|PG_BUSY)) == PG_ACTIVE && + p->wire_count == 0 && + p->hold_count == 0 && + pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { + if (!pmap_is_referenced(VM_PAGE_TO_PHYS(p))) { + p->act_count -= min(p->act_count, ACT_DECLINE); + /* + * if the page act_count is zero -- then we deactivate + */ + if (!p->act_count) { + vm_page_deactivate(p); + pmap_page_protect(VM_PAGE_TO_PHYS(p), + VM_PROT_NONE); + /* + * else if on the next go-around we will deactivate the page + * we need to place the page on the end of the queue to age + * the other pages in memory. 
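The aging policy that this object scan and the later queue scans apply is deliberately asymmetric: a reference earns ACT_ADVANCE (capped at ACT_MAX), a miss costs only ACT_DECLINE, and a page is deactivated only when its counter finally drains to zero. In isolation, using the constants defined earlier in this file:

#define ACT_DECLINE	1
#define ACT_ADVANCE	6
#define ACT_MAX		300
#define MIN(a, b)	((a) < (b) ? (a) : (b))

typedef struct page { int act_count; } page_t;

/* returns 1 when the page should be deactivated */
static int
age_page(page_t *m, int referenced)
{
	if (referenced) {
		if (m->act_count < ACT_MAX)
			m->act_count += ACT_ADVANCE;	/* slow rise */
		return 0;
	}
	m->act_count -= MIN(m->act_count, ACT_DECLINE);	/* gentle decay */
	return (m->act_count == 0);
}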
+ */ + } else { + TAILQ_REMOVE(&vm_page_queue_active, p, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); + TAILQ_REMOVE(&object->memq, p, listq); + TAILQ_INSERT_TAIL(&object->memq, p, listq); + } + /* + * see if we are done yet + */ + if (p->flags & PG_INACTIVE) { + --count; + ++dcount; + if (count <= 0 && + cnt.v_inactive_count > cnt.v_inactive_target) { + vm_page_unlock_queues(); + return dcount; + } + } + + } else { + /* + * Move the page to the bottom of the queue. + */ + pmap_clear_reference(VM_PAGE_TO_PHYS(p)); + if (p->act_count < ACT_MAX) + p->act_count += ACT_ADVANCE; + + TAILQ_REMOVE(&vm_page_queue_active, p, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); + TAILQ_REMOVE(&object->memq, p, listq); + TAILQ_INSERT_TAIL(&object->memq, p, listq); } - continue; } + vm_page_unlock_queues(); + p = next; + } + return dcount; +} + + +/* + * deactivate some number of pages in a map, try to do it fairly, but + * that is really hard to do. + */ + +void +vm_pageout_map_deactivate_pages(map, entry, count, freeer) + vm_map_t map; + vm_map_entry_t entry; + int *count; + int (*freeer)(vm_map_t, vm_object_t, int); +{ + vm_map_t tmpm; + vm_map_entry_t tmpe; + vm_object_t obj; + if (*count <= 0) + return; + vm_map_reference(map); + if (!lock_try_read(&map->lock)) { + vm_map_deallocate(map); + return; + } + if (entry == 0) { + tmpe = map->header.next; + while (tmpe != &map->header && *count > 0) { + vm_pageout_map_deactivate_pages(map, tmpe, count, freeer); + tmpe = tmpe->next; + }; + } else if (entry->is_sub_map || entry->is_a_map) { + tmpm = entry->object.share_map; + tmpe = tmpm->header.next; + while (tmpe != &tmpm->header && *count > 0) { + vm_pageout_map_deactivate_pages(tmpm, tmpe, count, freeer); + tmpe = tmpe->next; + }; + } else if (obj = entry->object.vm_object) { + *count -= (*freeer)(map, obj, *count); + } + lock_read_done(&map->lock); + vm_map_deallocate(map); + return; +} + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. + */ +int +vm_pageout_scan() +{ + vm_page_t m; + int page_shortage, maxscan, maxlaunder; + int pages_freed, free, nproc; + int desired_free; + vm_page_t next; + struct proc *p; + vm_object_t object; + int s; + int force_wakeup = 0; + +morefree: + /* + * scan the processes for exceeding their rlimits or if process + * is swapped out -- deactivate pages + */ + +rescanproc1: + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + vm_offset_t size; + int overage; + vm_offset_t limit; + /* - * If the page is dirty but already being washed, skip it. + * if this is a system process or if we have already + * looked at this process, skip it. */ - if ((m->flags & PG_LAUNDRY) == 0) + if (p->p_flag & (P_SYSTEM|P_WEXIT)) { continue; + } /* - * Otherwise the page is dirty and still in the laundry, - * so we start the cleaning operation and remove it from - * the laundry. + * if the process is in a non-running type state, + * don't touch it. */ - object = m->object; - if (!vm_object_lock_try(object)) + if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; - cnt.v_pageouts++; -#ifdef CLUSTERED_PAGEOUT - if (object->pager && - vm_pager_cancluster(object->pager, PG_CLUSTERPUT)) - vm_pageout_cluster(m, object); - else -#endif - vm_pageout_page(m, object); - thread_wakeup((int) object); - vm_object_unlock(object); + } + /* - * Former next page may no longer even be on the inactive - * queue (due to potential blocking in the pager with the - * queues unlocked). If it isn't, we just start over. 
+ * get a limit */ - if (next && (next->flags & PG_INACTIVE) == 0) - next = vm_page_queue_inactive.tqh_first; - } - - /* - * Compute the page shortage. If we are still very low on memory - * be sure that we will move a minimal amount of pages from active - * to inactive. - */ - - page_shortage = cnt.v_inactive_target - cnt.v_inactive_count; - if (page_shortage <= 0 && pages_freed == 0) - page_shortage = 1; - - while (page_shortage > 0) { + limit = min(p->p_rlimit[RLIMIT_RSS].rlim_cur, + p->p_rlimit[RLIMIT_RSS].rlim_max); + /* - * Move some more pages from active to inactive. + * let processes that are swapped out really be swapped out + * set the limit to nothing (will force a swap-out.) */ + if ((p->p_flag & P_INMEM) == 0) + limit = 0; + + size = p->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG; + if (size >= limit) { + overage = (size - limit) / NBPG; + vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, + (vm_map_entry_t) 0, &overage, vm_pageout_object_deactivate_pages); + } - if ((m = vm_page_queue_active.tqh_first) == NULL) - break; - vm_page_deactivate(m); - page_shortage--; } - vm_page_unlock_queues(); -} + if (((cnt.v_free_count + cnt.v_inactive_count) >= + (cnt.v_inactive_target + cnt.v_free_target)) && + (cnt.v_free_count >= cnt.v_free_target)) + return force_wakeup; -/* - * Called with object and page queues locked. - * If reactivate is TRUE, a pager error causes the page to be - * put back on the active queue, ow it is left on the inactive queue. - */ -void -vm_pageout_page(m, object) - vm_page_t m; - vm_object_t object; -{ - vm_pager_t pager; - int pageout_status; + pages_freed = 0; + desired_free = cnt.v_free_target; /* - * We set the busy bit to cause potential page faults on - * this page to block. - * - * We also set pageout-in-progress to keep the object from - * disappearing during pageout. This guarantees that the - * page won't move from the inactive queue. (However, any - * other page on the inactive queue may move!) + * Start scanning the inactive queue for pages we can free. + * We keep scanning until we have enough free pages or + * we have scanned through the entire queue. If we + * encounter dirty pages, we start cleaning them. */ - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); - m->flags |= PG_BUSY; - /* - * Try to collapse the object before making a pager for it. - * We must unlock the page queues first. - */ - vm_page_unlock_queues(); - if (object->pager == NULL) - vm_object_collapse(object); + maxlaunder = (cnt.v_free_target - cnt.v_free_count); + maxscan = cnt.v_inactive_count; +rescan1: + m = vm_page_queue_inactive.tqh_first; + while (m && (maxscan-- > 0) && + (cnt.v_free_count < desired_free) ) { + vm_page_t next; - object->paging_in_progress++; - vm_object_unlock(object); + next = m->pageq.tqe_next; - /* - * Do a wakeup here in case the following operations block. - */ - thread_wakeup((int) &cnt.v_free_count); + if( (m->flags & PG_INACTIVE) == 0) { + printf("vm_pageout_scan: page not inactive?"); + continue; + } - /* - * If there is no pager for the page, use the default pager. - * If there is no place to put the page at the moment, - * leave it in the laundry and hope that there will be - * paging space later. - */ - if ((pager = object->pager) == NULL) { - pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, object->size, - VM_PROT_ALL, (vm_offset_t)0); - if (pager != NULL) - vm_object_setpager(object, pager, 0, FALSE); - } - pageout_status = pager ? 
vm_pager_put(pager, m, FALSE) : VM_PAGER_FAIL; - vm_object_lock(object); - vm_page_lock_queues(); - - switch (pageout_status) { - case VM_PAGER_OK: - case VM_PAGER_PEND: - cnt.v_pgpgout++; - m->flags &= ~PG_LAUNDRY; - break; - case VM_PAGER_BAD: /* - * Page outside of range of object. Right now we - * essentially lose the changes by pretending it - * worked. - * - * XXX dubious, what should we do? + * activate held pages */ - m->flags &= ~PG_LAUNDRY; - m->flags |= PG_CLEAN; - pmap_clear_modify(VM_PAGE_TO_PHYS(m)); - break; - case VM_PAGER_AGAIN: - { - extern int lbolt; + if (m->hold_count != 0) { + vm_page_activate(m); + m = next; + continue; + } /* - * FAIL on a write is interpreted to mean a resource - * shortage, so we put pause for awhile and try again. - * XXX could get stuck here. + * dont mess with busy pages */ - (void) tsleep((caddr_t)&lbolt, PZERO|PCATCH, "pageout", 0); - break; - } - case VM_PAGER_FAIL: - case VM_PAGER_ERROR: + if (m->flags & PG_BUSY) { + m = next; + continue; + } + /* - * If page couldn't be paged out, then reactivate - * the page so it doesn't clog the inactive list. - * (We will try paging out it again later). + * if page is clean and but the page has been referenced, + * then reactivate the page, but if we are very low on memory + * or the page has not been referenced, then we free it to the + * vm system. */ - vm_page_activate(m); - cnt.v_reactivated++; - break; - } + if (m->flags & PG_CLEAN) { + if ((cnt.v_free_count > vm_pageout_free_min) /* XXX */ + && pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + vm_page_activate(m); + } else if (!m->act_count) { + pmap_page_protect(VM_PAGE_TO_PHYS(m), + VM_PROT_NONE); + vm_page_free(m); + ++pages_freed; + } else { + m->act_count -= min(m->act_count, ACT_DECLINE); + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + } + } else if ((m->flags & PG_LAUNDRY) && maxlaunder > 0) { + int written; + if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + vm_page_activate(m); + m = next; + continue; + } + /* + * If a page is dirty, then it is either + * being washed (but not yet cleaned) + * or it is still in the laundry. If it is + * still in the laundry, then we start the + * cleaning operation. + */ - pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + if (written = vm_pageout_clean(m,0)) { + maxlaunder -= written; + } + /* + * if the next page has been re-activated, start scanning again + */ + if (next && (next->flags & PG_INACTIVE) == 0) + goto rescan1; + } else if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + vm_page_activate(m); + } + m = next; + } /* - * If the operation is still going, leave the page busy - * to block all other accesses. Also, leave the paging - * in progress indicator set so that we don't attempt an - * object collapse. + * now check malloc area or swap processes out if we are in low + * memory conditions */ - if (pageout_status != VM_PAGER_PEND) { - m->flags &= ~PG_BUSY; - PAGE_WAKEUP(m); - object->paging_in_progress--; + if (cnt.v_free_count <= cnt.v_free_min) { + /* + * swap out inactive processes + */ + swapout_threads(); } -} - -#ifdef CLUSTERED_PAGEOUT -#define PAGEOUTABLE(p) \ - ((((p)->flags & (PG_INACTIVE|PG_CLEAN|PG_LAUNDRY)) == \ - (PG_INACTIVE|PG_LAUNDRY)) && !pmap_is_referenced(VM_PAGE_TO_PHYS(p))) - -/* - * Attempt to pageout as many contiguous (to ``m'') dirty pages as possible - * from ``object''. 
Using information returned from the pager, we assemble - * a sorted list of contiguous dirty pages and feed them to the pager in one - * chunk. Called with paging queues and object locked. Also, object must - * already have a pager. - */ -void -vm_pageout_cluster(m, object) - vm_page_t m; - vm_object_t object; -{ - vm_offset_t offset, loff, hoff; - vm_page_t plist[MAXPOCLUSTER], *plistp, p; - int postatus, ix, count; /* - * Determine the range of pages that can be part of a cluster - * for this object/offset. If it is only our single page, just - * do it normally. + * Compute the page shortage. If we are still very low on memory + * be sure that we will move a minimal amount of pages from active + * to inactive. */ - vm_pager_cluster(object->pager, m->offset, &loff, &hoff); - if (hoff - loff == PAGE_SIZE) { - vm_pageout_page(m, object); - return; + + page_shortage = cnt.v_inactive_target - + (cnt.v_free_count + cnt.v_inactive_count); + + if (page_shortage <= 0) { + if (pages_freed == 0) { + if( cnt.v_free_count < cnt.v_free_min) { + page_shortage = cnt.v_free_min - cnt.v_free_count; + } else if(((cnt.v_free_count + cnt.v_inactive_count) < + (cnt.v_free_min + cnt.v_inactive_target))) { + page_shortage = 1; + } else { + page_shortage = 0; + } + } + } - plistp = plist; + maxscan = cnt.v_active_count; + m = vm_page_queue_active.tqh_first; + while (m && maxscan-- && (page_shortage > 0)) { - /* - * Target page is always part of the cluster. - */ - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); - m->flags |= PG_BUSY; - plistp[atop(m->offset - loff)] = m; - count = 1; + next = m->pageq.tqe_next; - /* - * Backup from the given page til we find one not fulfilling - * the pageout criteria or we hit the lower bound for the - * cluster. For each page determined to be part of the - * cluster, unmap it and busy it out so it won't change. - */ - ix = atop(m->offset - loff); - offset = m->offset; - while (offset > loff && count < MAXPOCLUSTER-1) { - p = vm_page_lookup(object, offset - PAGE_SIZE); - if (p == NULL || !PAGEOUTABLE(p)) - break; - pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); - p->flags |= PG_BUSY; - plistp[--ix] = p; - offset -= PAGE_SIZE; - count++; + /* + * Don't deactivate pages that are busy. + */ + if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { + m = next; + continue; + } + + if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + if (m->act_count < ACT_MAX) + m->act_count += ACT_ADVANCE; + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + } else { + m->act_count -= min(m->act_count, ACT_DECLINE); + + /* + * if the page act_count is zero -- then we deactivate + */ + if (!m->act_count) { + vm_page_deactivate(m); + --page_shortage; + /* + * else if on the next go-around we will deactivate the page + * we need to place the page on the end of the queue to age + * the other pages in memory. + */ + } else { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + } + } + + m = next; } - plistp += atop(offset - loff); - loff = offset; /* - * Now do the same moving forward from the target. + * if we have not freed any pages and we are desparate for memory + * then we keep trying until we get some (any) memory. 
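The inactive-queue policy introduced above reduces to a three-way decision for each clean page. A condensed sketch, using the names from the patch (ACT_DECLINE, vm_pageout_free_min, the vm_page queue macros) rather than the committed code verbatim:

	/* clean page on the inactive queue: promote, reclaim, or age */
	if (cnt.v_free_count > vm_pageout_free_min &&
	    pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
		vm_page_activate(m);			/* recently used */
	} else if (m->act_count == 0) {
		pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
		vm_page_free(m);			/* idle: reclaim now */
		++pages_freed;
	} else {
		/* decay and rotate to the tail so other pages age too */
		m->act_count -= min(m->act_count, ACT_DECLINE);
		TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq);
		TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
	}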
*/ - ix = atop(m->offset - loff) + 1; - offset = m->offset + PAGE_SIZE; - while (offset < hoff && count < MAXPOCLUSTER) { - p = vm_page_lookup(object, offset); - if (p == NULL || !PAGEOUTABLE(p)) - break; - pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); - p->flags |= PG_BUSY; - plistp[ix++] = p; - offset += PAGE_SIZE; - count++; + + if( !force_wakeup && (swap_pager_full || !force_wakeup || + (pages_freed == 0 && (cnt.v_free_count < cnt.v_free_min)))){ + vm_pager_sync(); + force_wakeup = 1; + goto morefree; } - hoff = offset; + vm_page_pagesfreed += pages_freed; + return force_wakeup; +} - /* - * Pageout the page. - * Unlock everything and do a wakeup prior to the pager call - * in case it blocks. - */ - vm_page_unlock_queues(); - object->paging_in_progress++; - vm_object_unlock(object); -again: - thread_wakeup((int) &cnt.v_free_count); - postatus = vm_pager_put_pages(object->pager, plistp, count, FALSE); - /* - * XXX rethink this - */ - if (postatus == VM_PAGER_AGAIN) { - extern int lbolt; +void +vm_pagescan() +{ + int maxscan, pages_scanned, pages_referenced, nextscan, scantick = hz/20; + int m_ref, next_ref; + vm_page_t m, next; - (void) tsleep((caddr_t)&lbolt, PZERO|PCATCH, "pageout", 0); - goto again; - } else if (postatus == VM_PAGER_BAD) - panic("vm_pageout_cluster: VM_PAGER_BAD"); - vm_object_lock(object); - vm_page_lock_queues(); + (void) spl0(); + + nextscan = scantick; + +scanloop: + + pages_scanned = 0; + pages_referenced = 0; + maxscan = min(cnt.v_active_count, MAXSCAN); /* - * Loop through the affected pages, reflecting the outcome of - * the operation. + * Gather statistics on page usage. */ - for (ix = 0; ix < count; ix++) { - p = *plistp++; - switch (postatus) { - case VM_PAGER_OK: - case VM_PAGER_PEND: - cnt.v_pgpgout++; - p->flags &= ~PG_LAUNDRY; - break; - case VM_PAGER_FAIL: - case VM_PAGER_ERROR: - /* - * Pageout failed, reactivate the target page so it - * doesn't clog the inactive list. Other pages are - * left as they are. - */ - if (p == m) { - vm_page_activate(p); - cnt.v_reactivated++; - } - break; - } - pmap_clear_reference(VM_PAGE_TO_PHYS(p)); + m = vm_page_queue_active.tqh_first; + while (m && (maxscan-- > 0)) { + + ++pages_scanned; + + next = m->pageq.tqe_next; + /* - * If the operation is still going, leave the page busy - * to block all other accesses. + * Dont mess with pages that are busy. */ - if (postatus != VM_PAGER_PEND) { - p->flags &= ~PG_BUSY; - PAGE_WAKEUP(p); + if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + m = next; + continue; + } + /* + * Advance pages that have been referenced, decline pages that + * have not. + */ + if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + pages_referenced++; + if (m->act_count < ACT_MAX) + m->act_count += ACT_ADVANCE; + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + } else { + m->act_count -= min(m->act_count, ACT_DECLINE); + /* + * if the page act_count is zero, and we are low on mem -- then we deactivate + */ + if (!m->act_count && + (cnt.v_free_count+cnt.v_inactive_count < cnt.v_free_target+cnt.v_inactive_target )) { + vm_page_deactivate(m); + /* + * else if on the next go-around we will deactivate the page + * we need to place the page on the end of the queue to age + * the other pages in memory. 
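The active-queue pass above is a clock-style aging loop: referenced pages gain ACT_ADVANCE (capped at ACT_MAX), unreferenced pages decay by ACT_DECLINE and are deactivated once the count reaches zero. A sketch; requeue_tail() is a hypothetical shorthand for the paired TAILQ_REMOVE/TAILQ_INSERT_TAIL calls on the active queue and the object's memq:

	if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
		pmap_clear_reference(VM_PAGE_TO_PHYS(m));
		if (m->act_count < ACT_MAX)
			m->act_count += ACT_ADVANCE;
		requeue_tail(m);		/* hypothetical helper */
	} else {
		m->act_count -= min(m->act_count, ACT_DECLINE);
		if (m->act_count == 0) {
			vm_page_deactivate(m);
			--page_shortage;
		} else
			requeue_tail(m);	/* not idle yet: age it */
	}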
+ */ + } else { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + } } + m = next; } - /* - * If the operation is still going, leave the paging in progress - * indicator set so that we don't attempt an object collapse. - */ - if (postatus != VM_PAGER_PEND) - object->paging_in_progress--; + if (pages_referenced) { + nextscan = (pages_scanned / pages_referenced) * scantick; + nextscan = max(nextscan, scantick); + nextscan = min(nextscan, hz); + } else + nextscan = hz; + tsleep((caddr_t) &vm_pagescanner, PVM, "scanw", nextscan); + + goto scanloop; } -#endif /* * vm_pageout is the high level pageout daemon. */ - -void vm_pageout() +void +vm_pageout() { + extern npendingio, swiopend; + static nowakeup; (void) spl0(); /* * Initialize some paging parameters. */ - if (cnt.v_free_min == 0) { - cnt.v_free_min = VM_PAGE_FREE_MIN; - vm_page_free_min_min /= cnt.v_page_size; - vm_page_free_min_max /= cnt.v_page_size; - if (cnt.v_free_min < vm_page_free_min_min) - cnt.v_free_min = vm_page_free_min_min; - if (cnt.v_free_min > vm_page_free_min_max) - cnt.v_free_min = vm_page_free_min_max; - } - - if (cnt.v_free_target == 0) - cnt.v_free_target = VM_PAGE_FREE_TARGET; - - if (cnt.v_free_target <= cnt.v_free_min) - cnt.v_free_target = cnt.v_free_min + 1; - - /* XXX does not really belong here */ +vmretry: + cnt.v_free_min = 12; + cnt.v_free_reserved = 8; + if (cnt.v_free_min < 8) + cnt.v_free_min = 8; + if (cnt.v_free_min > 32) + cnt.v_free_min = 32; + vm_pageout_free_min = 4; + cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; + cnt.v_inactive_target = cnt.v_free_count / 12; + cnt.v_free_min += cnt.v_free_reserved; + + /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; + + (void) swap_pager_alloc(0, 0, 0, 0); + /* * The pageout daemon is never done, so loop * forever. */ - - simple_lock(&vm_pages_needed_lock); while (TRUE) { - thread_sleep((int) &vm_pages_needed, &vm_pages_needed_lock, - FALSE); - /* - * Compute the inactive target for this scan. - * We need to keep a reasonable amount of memory in the - * inactive list to better simulate LRU behavior. - */ - cnt.v_inactive_target = - (cnt.v_active_count + cnt.v_inactive_count) / 3; - if (cnt.v_inactive_target <= cnt.v_free_target) - cnt.v_inactive_target = cnt.v_free_target + 1; - + int force_wakeup; + extern struct loadavg averunnable; +/* + cnt.v_free_min = 12 + averunnable.ldavg[0] / 1024; + cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; + cnt.v_inactive_target = cnt.v_free_target*2; +*/ + + tsleep((caddr_t) &vm_pages_needed, PVM, "psleep", 0); + + vm_pager_sync(); /* - * Only make a scan if we are likely to do something. - * Otherwise we might have been awakened by a pager - * to clean up async pageouts. + * The force wakeup hack added to eliminate delays and potiential + * deadlock. It was possible for the page daemon to indefintely + * postpone waking up a process that it might be waiting for memory + * on. The putmulti stuff seems to have aggravated the situation. 
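vm_pagescan() above derives its sleep interval from the scanned-to-referenced ratio, clamped to [scantick, hz]. With hz = 100 (so scantick = hz/20 = 5 ticks), a scan of 200 pages that finds 50 referenced sleeps (200 / 50) * 5 = 20 ticks; a scan that finds none backs off to a full second. The interval choice, restated:

	/* restatement of the nextscan computation above */
	if (pages_referenced)
		nextscan = min(max((pages_scanned / pages_referenced) *
		    scantick, scantick), hz);
	else
		nextscan = hz;		/* nothing referenced: back off */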
*/ - if (cnt.v_free_count < cnt.v_free_target || - cnt.v_inactive_count < cnt.v_inactive_target) - vm_pageout_scan(); + force_wakeup = vm_pageout_scan(); vm_pager_sync(); - simple_lock(&vm_pages_needed_lock); - thread_wakeup((int) &cnt.v_free_count); + if( force_wakeup) + wakeup( (caddr_t) &cnt.v_free_count); + cnt.v_scan++; + wakeup((caddr_t) kmem_map); } } + diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index a82a0ea..834aee5 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -72,7 +72,11 @@ extern int vm_pages_needed; /* should be some "event" structure */ simple_lock_data_t vm_pages_needed_lock; +extern int vm_pageout_pages_needed; +#define VM_PAGEOUT_ASYNC 0 +#define VM_PAGEOUT_SYNC 1 +#define VM_PAGEOUT_FORCE 2 /* * Exported routines. @@ -82,15 +86,27 @@ simple_lock_data_t vm_pages_needed_lock; * Signal pageout-daemon and wait for it. */ -#define VM_WAIT { \ - simple_lock(&vm_pages_needed_lock); \ - thread_wakeup((int)&vm_pages_needed); \ - thread_sleep((int)&cnt.v_free_count, \ - &vm_pages_needed_lock, FALSE); \ - } +#define VM_WAIT vm_wait() + +inline static void vm_wait() { + extern struct proc *curproc, *pageproc; + int s; + s = splhigh(); + if (curproc == pageproc) { + vm_pageout_pages_needed = 1; + tsleep((caddr_t) &vm_pageout_pages_needed, PSWP, "vmwait", 0); + vm_pageout_pages_needed = 0; + } else { + wakeup((caddr_t) &vm_pages_needed); + tsleep((caddr_t) &cnt.v_free_count, PVM, "vmwait", 0); + } + splx(s); +} + + #ifdef KERNEL void vm_pageout __P((void)); -void vm_pageout_scan __P((void)); +int vm_pageout_scan __P((void)); void vm_pageout_page __P((vm_page_t, vm_object_t)); void vm_pageout_cluster __P((vm_page_t, vm_object_t)); #endif diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 7123abb..1e4b201 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -75,34 +75,14 @@ #include <vm/vm_page.h> #include <vm/vm_kern.h> -#ifdef SWAPPAGER extern struct pagerops swappagerops; -#endif - -#ifdef VNODEPAGER extern struct pagerops vnodepagerops; -#endif - -#ifdef DEVPAGER extern struct pagerops devicepagerops; -#endif struct pagerops *pagertab[] = { -#ifdef SWAPPAGER &swappagerops, /* PG_SWAP */ -#else - NULL, -#endif -#ifdef VNODEPAGER &vnodepagerops, /* PG_VNODE */ -#else - NULL, -#endif -#ifdef DEVPAGER &devicepagerops, /* PG_DEV */ -#else - NULL, -#endif }; int npagers = sizeof (pagertab) / sizeof (pagertab[0]); @@ -118,6 +98,7 @@ struct pagerops *dfltpagerops = NULL; /* default pager */ */ #define PAGER_MAP_SIZE (4 * 1024 * 1024) +int pager_map_size = PAGER_MAP_SIZE; vm_map_t pager_map; boolean_t pager_map_wanted; vm_offset_t pager_sva, pager_eva; @@ -130,8 +111,10 @@ vm_pager_init() /* * Allocate a kernel submap for tracking get/put page mappings */ +/* pager_map = kmem_suballoc(kernel_map, &pager_sva, &pager_eva, PAGER_MAP_SIZE, FALSE); +*/ /* * Initialize known pagers */ @@ -173,38 +156,61 @@ vm_pager_deallocate(pager) (*pager->pg_ops->pgo_dealloc)(pager); } + int -vm_pager_get_pages(pager, mlist, npages, sync) +vm_pager_get_pages(pager, m, count, reqpage, sync) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t *m; + int count; + int reqpage; boolean_t sync; { - int rv; + extern boolean_t vm_page_zero_fill(); + extern int vm_pageout_count; + int i; if (pager == NULL) { - rv = VM_PAGER_OK; - while (npages--) - if (!vm_page_zero_fill(*mlist)) { - rv = VM_PAGER_FAIL; - break; - } else - mlist++; - return (rv); + for (i=0;i<count;i++) { + if( i != reqpage) { + PAGE_WAKEUP(m[i]); + vm_page_free(m[i]); + } + } + 
vm_page_zero_fill(m[reqpage]); + return VM_PAGER_OK; + } + + if( pager->pg_ops->pgo_getpages == 0) { + for(i=0;i<count;i++) { + if( i != reqpage) { + PAGE_WAKEUP(m[i]); + vm_page_free(m[i]); + } + } + return(VM_PAGER_GET(pager, m[reqpage], sync)); + } else { + return(VM_PAGER_GET_MULTI(pager, m, count, reqpage, sync)); } - return ((*pager->pg_ops->pgo_getpages)(pager, mlist, npages, sync)); } int -vm_pager_put_pages(pager, mlist, npages, sync) +vm_pager_put_pages(pager, m, count, sync, rtvals) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t *m; + int count; boolean_t sync; + int *rtvals; { - if (pager == NULL) - panic("vm_pager_put_pages: null pager"); - return ((*pager->pg_ops->pgo_putpages)(pager, mlist, npages, sync)); + int i; + + if( pager->pg_ops->pgo_putpages) + return(VM_PAGER_PUT_MULTI(pager, m, count, sync, rtvals)); + else { + for(i=0;i<count;i++) { + rtvals[i] = VM_PAGER_PUT( pager, m[i], sync); + } + return rtvals[0]; + } } boolean_t @@ -228,9 +234,10 @@ vm_pager_sync() for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops) - (*(*pgops)->pgo_putpages)(NULL, NULL, 0, FALSE); + (*(*pgops)->pgo_putpage)(NULL, NULL, 0); } +#if 0 void vm_pager_cluster(pager, offset, loff, hoff) vm_pager_t pager; @@ -242,91 +249,25 @@ vm_pager_cluster(pager, offset, loff, hoff) panic("vm_pager_cluster: null pager"); return ((*pager->pg_ops->pgo_cluster)(pager, offset, loff, hoff)); } - -void -vm_pager_clusternull(pager, offset, loff, hoff) - vm_pager_t pager; - vm_offset_t offset; - vm_offset_t *loff; - vm_offset_t *hoff; -{ - panic("vm_pager_nullcluster called"); -} +#endif vm_offset_t -vm_pager_map_pages(mlist, npages, canwait) - vm_page_t *mlist; - int npages; - boolean_t canwait; +vm_pager_map_page(m) + vm_page_t m; { - vm_offset_t kva, va; - vm_size_t size; - vm_page_t m; + vm_offset_t kva; - /* - * Allocate space in the pager map, if none available return 0. - * This is basically an expansion of kmem_alloc_wait with optional - * blocking on no space. 
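A hypothetical caller of the widened vm_pager_get_pages() interface above; only marray[reqpage] is the page a fault is actually blocked on, and the pager is free to discard the surrounding read-ahead/read-behind pages. This helper is illustrative, not part of the patch:

int
fetch_cluster(pager, marray, count, reqpage)
	vm_pager_t pager;
	vm_page_t *marray;
	int count, reqpage;
{
	/* synchronous get; reqpage selects the page that must succeed */
	if (vm_pager_get_pages(pager, marray, count, reqpage, TRUE) !=
	    VM_PAGER_OK)
		return (EIO);
	return (0);
}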
- */ - size = npages * PAGE_SIZE; - vm_map_lock(pager_map); - while (vm_map_findspace(pager_map, 0, size, &kva)) { - if (!canwait) { - vm_map_unlock(pager_map); - return (0); - } - pager_map_wanted = TRUE; - vm_map_unlock(pager_map); - (void) tsleep(pager_map, PVM, "pager_map", 0); - vm_map_lock(pager_map); - } - vm_map_insert(pager_map, NULL, 0, kva, kva + size); - vm_map_unlock(pager_map); - - for (va = kva; npages--; va += PAGE_SIZE) { - m = *mlist++; -#ifdef DEBUG - if ((m->flags & PG_BUSY) == 0) - panic("vm_pager_map_pages: page not busy"); - if (m->flags & PG_PAGEROWNED) - panic("vm_pager_map_pages: page already in pager"); -#endif -#ifdef DEBUG - m->flags |= PG_PAGEROWNED; -#endif - pmap_enter(vm_map_pmap(pager_map), va, VM_PAGE_TO_PHYS(m), - VM_PROT_DEFAULT, TRUE); - } - return (kva); + kva = kmem_alloc_wait(pager_map, PAGE_SIZE); + pmap_enter(vm_map_pmap(pager_map), kva, VM_PAGE_TO_PHYS(m), + VM_PROT_DEFAULT, TRUE); + return(kva); } void -vm_pager_unmap_pages(kva, npages) +vm_pager_unmap_page(kva) vm_offset_t kva; - int npages; { - vm_size_t size = npages * PAGE_SIZE; - -#ifdef DEBUG - vm_offset_t va; - vm_page_t m; - int np = npages; - - for (va = kva; np--; va += PAGE_SIZE) { - m = vm_pager_atop(va); - if (m->flags & PG_PAGEROWNED) - m->flags &= ~PG_PAGEROWNED; - else - printf("vm_pager_unmap_pages: %x(%x/%x) not owned\n", - m, va, VM_PAGE_TO_PHYS(m)); - } -#endif - pmap_remove(vm_map_pmap(pager_map), kva, kva + size); - vm_map_lock(pager_map); - (void) vm_map_delete(pager_map, kva, kva + size); - if (pager_map_wanted) - wakeup(pager_map); - vm_map_unlock(pager_map); + kmem_free_wakeup(pager_map, kva, PAGE_SIZE); } vm_page_t diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h index e4659c2..3e20e50 100644 --- a/sys/vm/vm_pager.h +++ b/sys/vm/vm_pager.h @@ -1,3 +1,4 @@ + /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 @@ -74,17 +75,26 @@ struct pagerops { __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); void (*pgo_dealloc) /* Disassociate. */ __P((vm_pager_t)); + int (*pgo_getpage) + __P((vm_pager_t, vm_page_t, boolean_t)); int (*pgo_getpages) /* Get (read) page. */ - __P((vm_pager_t, vm_page_t *, int, boolean_t)); + __P((vm_pager_t, vm_page_t *, int, int, boolean_t)); + int (*pgo_putpage) + __P((vm_pager_t, vm_page_t, boolean_t)); int (*pgo_putpages) /* Put (write) page. */ - __P((vm_pager_t, vm_page_t *, int, boolean_t)); + __P((vm_pager_t, vm_page_t *, int, boolean_t, int *)); boolean_t (*pgo_haspage) /* Does pager have page? */ __P((vm_pager_t, vm_offset_t)); - void (*pgo_cluster) /* Return range of cluster. 
*/ - __P((vm_pager_t, vm_offset_t, - vm_offset_t *, vm_offset_t *)); }; +#define VM_PAGER_ALLOC(h, s, p, o) (*(pg)->pg_ops->pgo_alloc)(h, s, p, o) +#define VM_PAGER_DEALLOC(pg) (*(pg)->pg_ops->pgo_dealloc)(pg) +#define VM_PAGER_GET(pg, m, s) (*(pg)->pg_ops->pgo_getpage)(pg, m, s) +#define VM_PAGER_GET_MULTI(pg, m, c, r, s) (*(pg)->pg_ops->pgo_getpages)(pg, m, c, r, s) +#define VM_PAGER_PUT(pg, m, s) (*(pg)->pg_ops->pgo_putpage)(pg, m, s) +#define VM_PAGER_PUT_MULTI(pg, m, c, s, rtval) (*(pg)->pg_ops->pgo_putpages)(pg, m, c, s, rtval) +#define VM_PAGER_HASPAGE(pg, o) (*(pg)->pg_ops->pgo_haspage)(pg, o) + /* * get/put return values * OK operation was successful @@ -107,21 +117,15 @@ extern struct pagerops *dfltpagerops; vm_pager_t vm_pager_allocate __P((int, caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); vm_page_t vm_pager_atop __P((vm_offset_t)); -void vm_pager_cluster - __P((vm_pager_t, vm_offset_t, - vm_offset_t *, vm_offset_t *)); -void vm_pager_clusternull - __P((vm_pager_t, vm_offset_t, - vm_offset_t *, vm_offset_t *)); void vm_pager_deallocate __P((vm_pager_t)); int vm_pager_get_pages - __P((vm_pager_t, vm_page_t *, int, boolean_t)); + __P((vm_pager_t, vm_page_t *, int, int, boolean_t)); boolean_t vm_pager_has_page __P((vm_pager_t, vm_offset_t)); void vm_pager_init __P((void)); vm_pager_t vm_pager_lookup __P((struct pagerlst *, caddr_t)); vm_offset_t vm_pager_map_pages __P((vm_page_t *, int, boolean_t)); int vm_pager_put_pages - __P((vm_pager_t, vm_page_t *, int, boolean_t)); + __P((vm_pager_t, vm_page_t *, int, boolean_t, int *)); void vm_pager_sync __P((void)); void vm_pager_unmap_pages __P((vm_offset_t, int)); @@ -134,13 +138,16 @@ void vm_pager_unmap_pages __P((vm_offset_t, int)); ({ \ vm_page_t ml[1]; \ ml[0] = (m); \ - vm_pager_get_pages(p, ml, 1, s); \ + vm_pager_get_pages(p, ml, 1, 0, s); \ }) + #define vm_pager_put(p, m, s) \ ({ \ + int rtval; \ vm_page_t ml[1]; \ ml[0] = (m); \ - vm_pager_put_pages(p, ml, 1, s); \ + vm_pager_put_pages(p, ml, 1, s, &rtval); \ + rtval; \ }) #endif diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h index 2d2c715..4a785ce 100644 --- a/sys/vm/vm_param.h +++ b/sys/vm/vm_param.h @@ -84,14 +84,25 @@ typedef int boolean_t; */ #define DEFAULT_PAGE_SIZE 4096 +#if 0 + /* * All references to the size of a page should be done with PAGE_SIZE * or PAGE_SHIFT. The fact they are variables is hidden here so that * we can easily make them constant if we so desire. */ +#ifndef PAGE_SIZE #define PAGE_SIZE cnt.v_page_size /* size of page */ +#endif +#ifndef PAGE_MASK #define PAGE_MASK page_mask /* size of page - 1 */ +#endif +#ifndef PAGE_SHIFT #define PAGE_SHIFT page_shift /* bits to shift for pages */ +#endif + +#endif + #ifdef KERNEL extern vm_size_t page_mask; extern int page_shift; @@ -129,17 +140,34 @@ extern int page_shift; * No rounding is used. */ #ifdef KERNEL + +#if 0 + +#ifndef atop #define atop(x) (((unsigned)(x)) >> PAGE_SHIFT) +#endif +#ifndef ptoa #define ptoa(x) ((vm_offset_t)((x) << PAGE_SHIFT)) +#endif /* * Round off or truncate to the nearest page. These will work * for either addresses or counts (i.e., 1 byte rounds to 1 page). 
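Whichever definitions end up live (the machine-dependent ones or the guarded fallbacks above), the intended arithmetic of the page macros is unchanged. Assuming PAGE_SIZE = 4096, PAGE_SHIFT = 12, PAGE_MASK = 4095:

	round_page(1);		/* -> 4096: one byte occupies a whole page */
	round_page(4096);	/* -> 4096: already aligned */
	trunc_page(8191);	/* -> 4096 */
	num_pages(8193);	/* -> 3: (8193 + 4095) >> 12 */
	atop(8192);		/* -> 2 */
	ptoa(3);		/* -> 12288 */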
*/ +#ifndef round_page #define round_page(x) \ ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) & ~PAGE_MASK)) +#endif +#ifndef trunc_page #define trunc_page(x) \ ((vm_offset_t)(((vm_offset_t)(x)) & ~PAGE_MASK)) +#endif +#ifndef num_pages +#define num_pages(x) \ + ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) >> PAGE_SHIFT)) +#endif + +#endif #define num_pages(x) \ ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) >> PAGE_SHIFT)) @@ -148,11 +176,13 @@ extern vm_offset_t first_addr; /* first physical page */ extern vm_offset_t last_addr; /* last physical page */ #else +#if 0 /* out-of-kernel versions of round_page and trunc_page */ #define round_page(x) \ ((((vm_offset_t)(x) + (vm_page_size - 1)) / vm_page_size) * vm_page_size) #define trunc_page(x) \ ((((vm_offset_t)(x)) / vm_page_size) * vm_page_size) +#endif #endif /* KERNEL */ #endif /* ASSEMBLER */ diff --git a/sys/vm/vm_prot.h b/sys/vm/vm_prot.h index b3bae43..ee009bc 100644 --- a/sys/vm/vm_prot.h +++ b/sys/vm/vm_prot.h @@ -75,7 +75,7 @@ * vm_prot_t VM protection values. */ -typedef int vm_prot_t; +typedef u_char vm_prot_t; /* * Protection values, defined as bits within the vm_prot_t type diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c index 10b7523..5008a09 100644 --- a/sys/vm/vm_swap.c +++ b/sys/vm/vm_swap.c @@ -51,6 +51,7 @@ */ int nswap, nswdev; +int vm_swap_size; #ifdef SEQSWAP int niswdev; /* number of interleaved swap devices */ int niswap; /* size of interleaved swap area */ @@ -143,9 +144,8 @@ swapinit() /* * Now set up swap buffer headers. */ - bswlist.b_actf = sp; for (i = 0; i < nswbuf - 1; i++, sp++) { - sp->b_actf = sp + 1; + TAILQ_INSERT_HEAD(&bswlist, sp, b_freelist); sp->b_rcred = sp->b_wcred = p->p_ucred; sp->b_vnbufs.le_next = NOLIST; } @@ -390,12 +390,18 @@ swfree(p, index) blk = niswap; for (swp = &swdevt[niswdev]; swp != sp; swp++) blk += swp->sw_nblks; +#if 0 rmfree(swapmap, nblks, blk); return (0); +#endif + rlist_free(&swapmap, blk, blk + nblks - 1); + vm_swap_size += nblks; + return (0); } #endif for (dvbase = 0; dvbase < nblks; dvbase += dmmax) { blk = nblks - dvbase; + #ifdef SEQSWAP if ((vsbase = index*dmmax + dvbase*niswdev) >= niswap) panic("swfree"); @@ -405,6 +411,7 @@ swfree(p, index) #endif if (blk > dmmax) blk = dmmax; +#if 0 if (vsbase == 0) { /* * First of all chunks... initialize the swapmap. @@ -422,6 +429,11 @@ swfree(p, index) vsbase + ctod(CLSIZE)); } else rmfree(swapmap, blk, vsbase); +#endif + /* XXX -- we need to exclude the first cluster as above */ + /* but for now, this will work fine... 
*/ + rlist_free(&swapmap, vsbase, vsbase + blk - 1); + vm_swap_size += blk; } return (0); } diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c index 3d49ea7..ee6ddf6 100644 --- a/sys/vm/vm_unix.c +++ b/sys/vm/vm_unix.c @@ -50,9 +50,12 @@ #include <vm/vm.h> +extern int swap_pager_full; + struct obreak_args { char *nsiz; }; + /* ARGSUSED */ int obreak(p, uap, retval) @@ -72,9 +75,11 @@ obreak(p, uap, retval) old = round_page(old + ctob(vm->vm_dsize)); diff = new - old; if (diff > 0) { + if (swap_pager_full) { + return(ENOMEM); + } rv = vm_allocate(&vm->vm_map, &old, diff, FALSE); if (rv != KERN_SUCCESS) { - uprintf("sbrk: grow failed, return = %d\n", rv); return(ENOMEM); } vm->vm_dsize += btoc(diff); @@ -82,7 +87,6 @@ obreak(p, uap, retval) diff = -diff; rv = vm_deallocate(&vm->vm_map, new, diff); if (rv != KERN_SUCCESS) { - uprintf("sbrk: shrink failed, return = %d\n", rv); return(ENOMEM); } vm->vm_dsize -= btoc(diff); @@ -90,41 +94,10 @@ obreak(p, uap, retval) return(0); } -/* - * Enlarge the "stack segment" to include the specified - * stack pointer for the process. - */ -int -grow(p, sp) - struct proc *p; - unsigned sp; -{ - register struct vmspace *vm = p->p_vmspace; - register int si; - - /* - * For user defined stacks (from sendsig). - */ - if (sp < (unsigned)vm->vm_maxsaddr) - return (0); - /* - * For common case of already allocated (from trap). - */ - if (sp >= USRSTACK - ctob(vm->vm_ssize)) - return (1); - /* - * Really need to check vs limit and increment stack size if ok. - */ - si = clrnd(btoc(USRSTACK-sp) - vm->vm_ssize); - if (vm->vm_ssize + si > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) - return (0); - vm->vm_ssize += si; - return (1); -} - struct ovadvise_args { int anom; }; + /* ARGSUSED */ int ovadvise(p, uap, retval) diff --git a/sys/vm/vm_user.c b/sys/vm/vm_user.c index 20172c6..0f2c234 100644 --- a/sys/vm/vm_user.c +++ b/sys/vm/vm_user.c @@ -168,6 +168,7 @@ svm_protect(p, uap, retval) return((int)rv); } +#endif /* * vm_inherit sets the inheritence of the specified range in the * specified map. @@ -203,7 +204,6 @@ vm_protect(map, start, size, set_maximum, new_protection) return(vm_map_protect(map, trunc_page(start), round_page(start+size), new_protection, set_maximum)); } -#endif /* * vm_allocate allocates "zero fill" memory in the specfied @@ -255,6 +255,7 @@ vm_deallocate(map, start, size) return(vm_map_remove(map, trunc_page(start), round_page(start+size))); } +#if 1 /* * Similar to vm_allocate but assigns an explicit pager. */ @@ -310,3 +311,4 @@ vm_allocate_with_pager(map, addr, size, anywhere, pager, poffset, internal) vm_object_setpager(object, pager, (vm_offset_t) 0, TRUE); return(result); } +#endif diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 9c2f826..b8e5a19 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -1,7 +1,8 @@ /* * Copyright (c) 1990 University of Utah. - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 1993,1994 John S. Dyson * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer @@ -35,7 +36,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
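One user-visible effect of the obreak() change above: when swap_pager_full is set, the data segment refuses to grow up front, and with the uprintf() calls removed the only signal is the error return. A hypothetical userland probe, not from the patch:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int
main()
{
	/* sbrk(2) now fails with ENOMEM, silently, once swap is exhausted */
	if (sbrk(64 * 1024) == (char *)-1 && errno == ENOMEM)
		fprintf(stderr, "sbrk: out of swap\n");
	return (0);
}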
* - * @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94 + * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 + * $Id: vnode_pager.c,v 1.17 1994/04/05 03:23:53 davidg Exp $ */ /* @@ -46,6 +48,24 @@ * fix credential use (uses current process credentials now) */ +/* + * MODIFICATIONS: + * John S. Dyson 08 Dec 93 + * + * This file in conjunction with some vm_fault mods, eliminate the performance + * advantage for using the buffer cache and minimize memory copies. + * + * 1) Supports multiple - block reads + * 2) Bypasses buffer cache for reads + * + * TODO: + * + * 1) Totally bypass buffer cache for reads + * (Currently will still sometimes use buffer cache for reads) + * 2) Bypass buffer cache for writes + * (Code does not support it, but mods are simple) + */ + #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -58,51 +78,44 @@ #include <vm/vm_page.h> #include <vm/vnode_pager.h> -struct pagerlst vnode_pager_list; /* list of managed vnodes */ +#include <sys/buf.h> +#include <miscfs/specfs/specdev.h> -#ifdef DEBUG -int vpagerdebug = 0x00; -#define VDB_FOLLOW 0x01 -#define VDB_INIT 0x02 -#define VDB_IO 0x04 -#define VDB_FAIL 0x08 -#define VDB_ALLOC 0x10 -#define VDB_SIZE 0x20 -#endif +int vnode_pager_putmulti(); -static vm_pager_t vnode_pager_alloc - __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); -static void vnode_pager_cluster - __P((vm_pager_t, vm_offset_t, - vm_offset_t *, vm_offset_t *)); -static void vnode_pager_dealloc __P((vm_pager_t)); -static int vnode_pager_getpage - __P((vm_pager_t, vm_page_t *, int, boolean_t)); -static boolean_t vnode_pager_haspage __P((vm_pager_t, vm_offset_t)); -static void vnode_pager_init __P((void)); -static int vnode_pager_io - __P((vn_pager_t, vm_page_t *, int, - boolean_t, enum uio_rw)); -static boolean_t vnode_pager_putpage - __P((vm_pager_t, vm_page_t *, int, boolean_t)); +void vnode_pager_init(); +vm_pager_t vnode_pager_alloc(caddr_t, vm_offset_t, vm_prot_t, vm_offset_t); +void vnode_pager_dealloc(); +int vnode_pager_getpage(); +int vnode_pager_getmulti(); +int vnode_pager_putpage(); +boolean_t vnode_pager_haspage(); struct pagerops vnodepagerops = { vnode_pager_init, vnode_pager_alloc, vnode_pager_dealloc, vnode_pager_getpage, + vnode_pager_getmulti, vnode_pager_putpage, - vnode_pager_haspage, - vnode_pager_cluster + vnode_pager_putmulti, + vnode_pager_haspage }; -static void +static int vnode_pager_input(vn_pager_t vnp, vm_page_t *m, int count, int reqpage); +static int vnode_pager_output(vn_pager_t vnp, vm_page_t *m, int count, int *rtvals); +struct buf * getpbuf() ; +void relpbuf(struct buf *bp) ; + +extern vm_map_t pager_map; + +struct pagerlst vnode_pager_list; /* list of managed vnodes */ + +#define MAXBP (PAGE_SIZE/DEV_BSIZE); + +void vnode_pager_init() { -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_init()\n"); -#endif TAILQ_INIT(&vnode_pager_list); } @@ -110,12 +123,12 @@ vnode_pager_init() * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ -static vm_pager_t -vnode_pager_alloc(handle, size, prot, foff) +vm_pager_t +vnode_pager_alloc(handle, size, prot, offset) caddr_t handle; vm_size_t size; vm_prot_t prot; - vm_offset_t foff; + vm_offset_t offset; { register vm_pager_t pager; register vn_pager_t vnp; @@ -124,10 +137,6 @@ vnode_pager_alloc(handle, size, prot, foff) struct vnode *vp; struct proc *p = curproc; /* XXX */ -#ifdef DEBUG - if (vpagerdebug & (VDB_FOLLOW|VDB_ALLOC)) - printf("vnode_pager_alloc(%x, %x, %x)\n", handle, size, prot); -#endif /* * Pageout to vnode, no can do yet. 
*/ @@ -171,12 +180,12 @@ vnode_pager_alloc(handle, size, prot, foff) vnp->vnp_flags = 0; vnp->vnp_vp = vp; vnp->vnp_size = vattr.va_size; + TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list); pager->pg_handle = handle; pager->pg_type = PG_VNODE; - pager->pg_flags = 0; pager->pg_ops = &vnodepagerops; - pager->pg_data = vnp; + pager->pg_data = (caddr_t)vnp; vp->v_vmdata = (caddr_t)pager; } else { /* @@ -184,121 +193,104 @@ vnode_pager_alloc(handle, size, prot, foff) * cache if found and also gain a reference to the object. */ object = vm_object_lookup(pager); -#ifdef DEBUG - vnp = (vn_pager_t)pager->pg_data; -#endif } -#ifdef DEBUG - if (vpagerdebug & VDB_ALLOC) - printf("vnode_pager_setup: vp %x sz %x pager %x object %x\n", - vp, vnp->vnp_size, pager, object); -#endif return(pager); } -static void +void vnode_pager_dealloc(pager) vm_pager_t pager; { register vn_pager_t vnp = (vn_pager_t)pager->pg_data; register struct vnode *vp; -#ifdef NOTDEF struct proc *p = curproc; /* XXX */ -#endif -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_dealloc(%x)\n", pager); -#endif if (vp = vnp->vnp_vp) { vp->v_vmdata = NULL; vp->v_flag &= ~VTEXT; -#if NOTDEF +#if 0 /* can hang if done at reboot on NFS FS */ (void) VOP_FSYNC(vp, p->p_ucred, p); #endif vrele(vp); } + TAILQ_REMOVE(&vnode_pager_list, pager, pg_list); free((caddr_t)vnp, M_VMPGDATA); free((caddr_t)pager, M_VMPAGER); } -static int -vnode_pager_getpage(pager, mlist, npages, sync) +int +vnode_pager_getmulti(pager, m, count, reqpage, sync) + vm_pager_t pager; + vm_page_t *m; + int count; + int reqpage; + boolean_t sync; +{ + + return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage); +} + +int +vnode_pager_getpage(pager, m, sync) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t m; boolean_t sync; { -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_getpage(%x, %x, %x, %x)\n", - pager, mlist, npages, sync); -#endif - return(vnode_pager_io((vn_pager_t)pager->pg_data, - mlist, npages, sync, UIO_READ)); + int err; + vm_page_t marray[1]; + if (pager == NULL) + return FALSE; + marray[0] = m; + + return vnode_pager_input((vn_pager_t)pager->pg_data, marray, 1, 0); } -static boolean_t -vnode_pager_putpage(pager, mlist, npages, sync) +boolean_t +vnode_pager_putpage(pager, m, sync) vm_pager_t pager; - vm_page_t *mlist; - int npages; + vm_page_t m; boolean_t sync; { int err; + vm_page_t marray[1]; + int rtvals[1]; -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_putpage(%x, %x, %x, %x)\n", - pager, mlist, npages, sync); -#endif if (pager == NULL) - return (FALSE); /* ??? */ - err = vnode_pager_io((vn_pager_t)pager->pg_data, - mlist, npages, sync, UIO_WRITE); - /* - * If the operation was successful, mark the pages clean. 
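The single-page entry points above are now thin shims over the multi-page routines: wrap the page in a one-element array, pass a one-element status array, and hand the per-page result back. The pattern, condensed from vnode_pager_putpage():

	vm_page_t marray[1];
	int rtvals[1];

	marray[0] = m;				/* wrap the single page */
	vnode_pager_output((vn_pager_t)pager->pg_data, marray, 1, rtvals);
	return (rtvals[0]);			/* per-page status out */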
- */ - if (err == VM_PAGER_OK) { - while (npages--) { - (*mlist)->flags |= PG_CLEAN; - pmap_clear_modify(VM_PAGE_TO_PHYS(*mlist)); - mlist++; - } - } - return(err); + return FALSE; + marray[0] = m; + vnode_pager_output((vn_pager_t)pager->pg_data, marray, 1, rtvals); + return rtvals[0]; +} + +int +vnode_pager_putmulti(pager, m, c, sync, rtvals) + vm_pager_t pager; + vm_page_t *m; + int c; + boolean_t sync; + int *rtvals; +{ + return vnode_pager_output((vn_pager_t)pager->pg_data, m, c, rtvals); } -static boolean_t + +boolean_t vnode_pager_haspage(pager, offset) vm_pager_t pager; vm_offset_t offset; { register vn_pager_t vnp = (vn_pager_t)pager->pg_data; daddr_t bn; + int run; int err; -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_haspage(%x, %x)\n", pager, offset); -#endif - /* * Offset beyond end of file, do not have the page - * Lock the vnode first to make sure we have the most recent - * version of the size. */ - VOP_LOCK(vnp->vnp_vp); if (offset >= vnp->vnp_size) { - VOP_UNLOCK(vnp->vnp_vp); -#ifdef DEBUG - if (vpagerdebug & (VDB_FAIL|VDB_SIZE)) - printf("vnode_pager_haspage: pg %x, off %x, size %x\n", - pager, offset, vnp->vnp_size); -#endif return(FALSE); } @@ -311,53 +303,14 @@ vnode_pager_haspage(pager, offset) */ err = VOP_BMAP(vnp->vnp_vp, offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize, - (struct vnode **)0, &bn, NULL); - VOP_UNLOCK(vnp->vnp_vp); + (struct vnode **)0, &bn, 0); if (err) { -#ifdef DEBUG - if (vpagerdebug & VDB_FAIL) - printf("vnode_pager_haspage: BMAP err %d, pg %x, off %x\n", - err, pager, offset); -#endif return(TRUE); } return((long)bn < 0 ? FALSE : TRUE); } -static void -vnode_pager_cluster(pager, offset, loffset, hoffset) - vm_pager_t pager; - vm_offset_t offset; - vm_offset_t *loffset; - vm_offset_t *hoffset; -{ - vn_pager_t vnp = (vn_pager_t)pager->pg_data; - vm_offset_t loff, hoff; - -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_cluster(%x, %x) ", pager, offset); -#endif - loff = offset; - if (loff >= vnp->vnp_size) - panic("vnode_pager_cluster: bad offset"); - /* - * XXX could use VOP_BMAP to get maxcontig value - */ - hoff = loff + MAXBSIZE; - if (hoff > round_page(vnp->vnp_size)) - hoff = round_page(vnp->vnp_size); - - *loffset = loff; - *hoffset = hoff; -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("returns [%x-%x]\n", loff, hoff); -#endif -} - /* - * (XXX) * Lets the VM system know about a change in size for a file. * If this vnode is mapped into some address space (i.e. we have a pager * for it) we adjust our own internal size and flush any cached pages in @@ -399,19 +352,14 @@ vnode_pager_setsize(vp, nsize) if (object == NULL) return; -#ifdef DEBUG - if (vpagerdebug & (VDB_FOLLOW|VDB_SIZE)) - printf("vnode_pager_setsize: vp %x obj %x osz %d nsz %d\n", - vp, object, vnp->vnp_size, nsize); -#endif /* * File has shrunk. * Toss any cached pages beyond the new EOF. 
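vnode_pager_haspage() above errs on the side of caution: a VOP_BMAP() failure is reported as "page exists" so the error surfaces on the subsequent read, while a hole (negative block number) is reported absent so the page is zero-filled instead. In outline, with bsize standing for the mount's f_iosize:

	if (offset >= vnp->vnp_size)		/* past EOF: absent */
		return (FALSE);
	if (VOP_BMAP(vnp->vnp_vp, offset / bsize, (struct vnode **)0,
	    &bn, 0))
		return (TRUE);			/* bmap failed: assume present */
	return ((long)bn < 0 ? FALSE : TRUE);	/* hole: zero fill instead */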
*/ - if (nsize < vnp->vnp_size) { + if (round_page(nsize) < round_page(vnp->vnp_size)) { vm_object_lock(object); vm_object_page_remove(object, - (vm_offset_t)nsize, vnp->vnp_size); + (vm_offset_t)round_page(nsize), round_page(vnp->vnp_size)); vm_object_unlock(object); } vnp->vnp_size = (vm_offset_t)nsize; @@ -425,24 +373,67 @@ vnode_pager_umount(mp) register vm_pager_t pager, npager; struct vnode *vp; - for (pager = vnode_pager_list.tqh_first; pager != NULL; pager = npager){ + pager = vnode_pager_list.tqh_first; + while( pager) { /* * Save the next pointer now since uncaching may * terminate the object and render pager invalid */ - npager = pager->pg_list.tqe_next; vp = ((vn_pager_t)pager->pg_data)->vnp_vp; - if (mp == (struct mount *)0 || vp->v_mount == mp) { - VOP_LOCK(vp); + npager = pager->pg_list.tqe_next; + if (mp == (struct mount *)0 || vp->v_mount == mp) (void) vnode_pager_uncache(vp); - VOP_UNLOCK(vp); - } + pager = npager; } } /* * Remove vnode associated object from the object cache. * + * Note: this routine may be invoked as a result of a pager put + * operation (possibly at object termination time), so we must be careful. + */ +boolean_t +vnode_pager_uncache(vp) + register struct vnode *vp; +{ + register vm_object_t object; + boolean_t uncached, locked; + vm_pager_t pager; + + /* + * Not a mapped vnode + */ + pager = (vm_pager_t)vp->v_vmdata; + if (pager == NULL) + return (TRUE); + /* + * Unlock the vnode if it is currently locked. + * We do this since uncaching the object may result + * in its destruction which may initiate paging + * activity which may necessitate locking the vnode. + */ + locked = VOP_ISLOCKED(vp); + if (locked) + VOP_UNLOCK(vp); + /* + * Must use vm_object_lookup() as it actually removes + * the object from the cache list. + */ + object = vm_object_lookup(pager); + if (object) { + uncached = (object->ref_count <= 1); + pager_cache(object, FALSE); + } else + uncached = TRUE; + if (locked) + VOP_LOCK(vp); + return(uncached); +} +#if 0 +/* + * Remove vnode associated object from the object cache. + * * XXX unlock the vnode if it is currently locked. * We must do this since uncaching the object may result in its * destruction which may initiate paging activity which may necessitate @@ -462,14 +453,6 @@ vnode_pager_uncache(vp) pager = (vm_pager_t)vp->v_vmdata; if (pager == NULL) return (TRUE); -#ifdef DEBUG - if (!VOP_ISLOCKED(vp)) { - extern int (**nfsv2_vnodeop_p)(); - - if (vp->v_op != nfsv2_vnodeop_p) - panic("vnode_pager_uncache: vnode not locked!"); - } -#endif /* * Must use vm_object_lookup() as it actually removes * the object from the cache list. 
@@ -484,97 +467,958 @@ vnode_pager_uncache(vp) uncached = TRUE; return(uncached); } +#endif -static int -vnode_pager_io(vnp, mlist, npages, sync, rw) - register vn_pager_t vnp; - vm_page_t *mlist; - int npages; - boolean_t sync; - enum uio_rw rw; + +void +vnode_pager_freepage(m) + vm_page_t m; +{ + PAGE_WAKEUP(m); + vm_page_free(m); +} + +/* + * calculate the linear (byte) disk address of specified virtual + * file address + */ +vm_offset_t +vnode_pager_addr(vp, address) + struct vnode *vp; + vm_offset_t address; +{ + int rtaddress; + int bsize; + vm_offset_t block; + struct vnode *rtvp; + int err; + int vblock, voffset; + int run; + + bsize = vp->v_mount->mnt_stat.f_iosize; + vblock = address / bsize; + voffset = address % bsize; + + err = VOP_BMAP(vp,vblock,&rtvp,&block,0); + + if( err) + rtaddress = -1; + else + rtaddress = block * DEV_BSIZE + voffset; + + return rtaddress; +} + +/* + * interrupt routine for I/O completion + */ +void +vnode_pager_iodone(bp) + struct buf *bp; { + bp->b_flags |= B_DONE; + wakeup((caddr_t)bp); +} + +/* + * small block file system vnode pager input + */ +int +vnode_pager_input_smlfs(vnp, m) + vn_pager_t vnp; + vm_page_t m; +{ + int i; + int s; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + struct buf *bp; + vm_offset_t mapsize; + vm_offset_t foff; + vm_offset_t kva; + int fileaddr; + int block; + vm_offset_t bsize; + int error = 0; + int run; + + paging_offset = m->object->paging_offset; + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + foff = m->offset + paging_offset; + + VOP_BMAP(vp, foff, &dp, 0, 0); + + kva = vm_pager_map_page(m); + + for(i=0;i<PAGE_SIZE/bsize;i++) { + /* + * calculate logical block and offset + */ + block = foff / bsize + i; + s = splbio(); + while (bp = incore(vp, block)) { + int amount; + + /* + * wait until the buffer is avail or gone + */ + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep ((caddr_t)bp, PVM, "vnwblk", 0); + continue; + } + + amount = bsize; + if ((foff + bsize) > vnp->vnp_size) + amount = vnp->vnp_size - foff; + + /* + * make sure that this page is in the buffer + */ + if ((amount > 0) && amount <= bp->b_bcount) { + bp->b_flags |= B_BUSY; + splx(s); + + /* + * copy the data from the buffer + */ + bcopy(bp->b_un.b_addr, (caddr_t)kva + i * bsize, amount); + if (amount < bsize) { + bzero((caddr_t)kva + amount, bsize - amount); + } + bp->b_flags &= ~B_BUSY; + wakeup((caddr_t)bp); + goto nextblock; + } + break; + } + splx(s); + fileaddr = vnode_pager_addr(vp, foff + i * bsize); + if( fileaddr != -1) { + bp = getpbuf(); + VHOLD(vp); + + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = vnode_pager_iodone; + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva + i * bsize; + bp->b_blkno = fileaddr / DEV_BSIZE; + bgetvp(dp, bp); + bp->b_bcount = bsize; + bp->b_bufsize = bsize; + + /* do the input */ + VOP_STRATEGY(bp); + + /* we definitely need to be at splbio here */ + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnsrd", 0); + } + splx(s); + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + if( error) + break; + } else { + bzero((caddr_t) kva + i * bsize, bsize); + } +nextblock: + } + vm_pager_unmap_page(kva); + if( error) { + return VM_PAGER_FAIL; 
+ } + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->flags |= PG_CLEAN; + m->flags &= ~PG_LAUNDRY; + return VM_PAGER_OK; + +} + + +/* + * old style vnode pager output routine + */ +int +vnode_pager_input_old(vnp, m) + vn_pager_t vnp; + vm_page_t m; +{ + int i; struct uio auio; struct iovec aiov; + int error; + int size; + vm_offset_t foff; + vm_offset_t kva; + + error = 0; + foff = m->offset + m->object->paging_offset; + /* + * Return failure if beyond current EOF + */ + if (foff >= vnp->vnp_size) { + return VM_PAGER_BAD; + } else { + size = PAGE_SIZE; + if (foff + size > vnp->vnp_size) + size = vnp->vnp_size - foff; +/* + * Allocate a kernel virtual address and initialize so that + * we can use VOP_READ/WRITE routines. + */ + kva = vm_pager_map_page(m); + aiov.iov_base = (caddr_t)kva; + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = foff; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_resid = size; + auio.uio_procp = (struct proc *)0; + + error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred); + if (!error) { + register int count = size - auio.uio_resid; + + if (count == 0) + error = EINVAL; + else if (count != PAGE_SIZE) + bzero((caddr_t)kva + count, PAGE_SIZE - count); + } + vm_pager_unmap_page(kva); + } + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->flags |= PG_CLEAN; + m->flags &= ~PG_LAUNDRY; + return error?VM_PAGER_FAIL:VM_PAGER_OK; +} + +/* + * generic vnode pager input routine + */ +int +vnode_pager_input(vnp, m, count, reqpage) + register vn_pager_t vnp; + vm_page_t *m; + int count, reqpage; +{ + int i,j; vm_offset_t kva, foff; - int error, size; + int size; struct proc *p = curproc; /* XXX */ + vm_object_t object; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + vm_offset_t mapsize; + int bsize; + + int first, last; + int reqaddr, firstaddr; + int run; + int block, offset; + + int nbp; + struct buf *bp; + int s; + int failflag; + + int errtype=0; /* 0 is file type otherwise vm type */ + int error = 0; + + object = m[reqpage]->object; /* all vm_page_t items are in same object */ + paging_offset = object->paging_offset; + + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + + /* get the UNDERLYING device for the file with VOP_BMAP() */ + /* + * originally, we did not check for an error return + * value -- assuming an fs always has a bmap entry point + * -- that assumption is wrong!!! + */ + kva = 0; + mapsize = 0; + foff = m[reqpage]->offset + paging_offset; + if (!VOP_BMAP(vp, foff, &dp, 0, 0)) { + /* + * we do not block for a kva, notice we default to a kva + * conservative behavior + */ + kva = kmem_alloc_pageable(pager_map, (mapsize = count*PAGE_SIZE)); + if( !kva) { + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + m[0] = m[reqpage]; + kva = kmem_alloc_wait(pager_map, mapsize = PAGE_SIZE); + reqpage = 0; + count = 1; + } + } - /* XXX */ - vm_page_t m; - if (npages != 1) - panic("vnode_pager_io: cannot handle multiple pages"); - m = *mlist; - /* XXX */ - -#ifdef DEBUG - if (vpagerdebug & VDB_FOLLOW) - printf("vnode_pager_io(%x, %x, %c): vnode %x\n", - vnp, m, rw == UIO_READ ? 'R' : 'W', vnp->vnp_vp); -#endif - foff = m->offset + m->object->paging_offset; /* - * Allocate a kernel virtual address and initialize so that - * we can use VOP_READ/WRITE routines. 
+ * if we can't get a kva or we can't bmap, use old VOP code */ - kva = vm_pager_map_pages(mlist, npages, sync); - if (kva == NULL) - return(VM_PAGER_AGAIN); + if (!kva) { + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + return vnode_pager_input_old(vnp, m[reqpage]); /* - * After all of the potentially blocking operations have been - * performed, we can do the size checks: - * read beyond EOF (returns error) - * short read + * if the blocksize is smaller than a page size, then use + * special small filesystem code. NFS sometimes has a small + * blocksize, but it can handle large reads itself. */ - VOP_LOCK(vnp->vnp_vp); - if (foff >= vnp->vnp_size) { - VOP_UNLOCK(vnp->vnp_vp); - vm_pager_unmap_pages(kva, npages); -#ifdef DEBUG - if (vpagerdebug & VDB_SIZE) - printf("vnode_pager_io: vp %x, off %d size %d\n", - vnp->vnp_vp, foff, vnp->vnp_size); -#endif - return(VM_PAGER_BAD); + } else if( (PAGE_SIZE / bsize) > 1 && + (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { + + kmem_free_wakeup(pager_map, kva, mapsize); + + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + return vnode_pager_input_smlfs(vnp, m[reqpage]); + } + +/* + * here on direct device I/O + */ + + + /* + * This pathetic hack gets data from the buffer cache, if it's there. + * I believe that this is not really necessary, and the ends can + * be gotten by defaulting to the normal vfs read behavior, but this + * might be more efficient, because the will NOT invoke read-aheads + * and one of the purposes of this code is to bypass the buffer + * cache and keep from flushing it by reading in a program. + */ + /* + * calculate logical block and offset + */ + block = foff / bsize; + offset = foff % bsize; + s = splbio(); + + /* + * if we have a buffer in core, then try to use it + */ + while (bp = incore(vp, block)) { + int amount; + + /* + * wait until the buffer is avail or gone + */ + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep ((caddr_t)bp, PVM, "vnwblk", 0); + continue; + } + + amount = PAGE_SIZE; + if ((foff + amount) > vnp->vnp_size) + amount = vnp->vnp_size - foff; + + /* + * make sure that this page is in the buffer + */ + if ((amount > 0) && (offset + amount) <= bp->b_bcount) { + bp->b_flags |= B_BUSY; + splx(s); + + /* + * map the requested page + */ + pmap_kenter(kva, VM_PAGE_TO_PHYS(m[reqpage])); + pmap_update(); + + /* + * copy the data from the buffer + */ + bcopy(bp->b_un.b_addr + offset, (caddr_t)kva, amount); + if (amount < PAGE_SIZE) { + bzero((caddr_t)kva + amount, PAGE_SIZE - amount); + } + /* + * unmap the page and free the kva + */ + pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE); + kmem_free_wakeup(pager_map, kva, mapsize); + /* + * release the buffer back to the block subsystem + */ + bp->b_flags &= ~B_BUSY; + wakeup((caddr_t)bp); + /* + * we did not have to do any work to get the requested + * page, the read behind/ahead does not justify a read + */ + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + count = 1; + reqpage = 0; + m[0] = m[reqpage]; + + /* + * sorry for the goto + */ + goto finishup; + } + /* + * buffer is nowhere to be found, read from the disk + */ + break; + } + splx(s); + + reqaddr = vnode_pager_addr(vp, foff); + s = splbio(); + /* + * Make sure that our I/O request is contiguous. + * Scan backward and stop for the first discontiguous + * entry or stop for a page being in buffer cache. 
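Both the backward and forward scans just below apply the same admission test: a page joins the cluster only when its disk address is exactly PAGE_SIZE-contiguous with the requested page and no copy of the underlying block already sits in the buffer cache. Distilled into one expression (the ok flag is a hypothetical name, everything else is from the patch):

	/* contiguity test common to the backward and forward scans */
	ok = !incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) &&
	    vnode_pager_addr(vp, m[i]->offset + paging_offset) ==
		reqaddr + (i - reqpage) * PAGE_SIZE;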
+ */ + failflag = 0; + first = reqpage; + for (i = reqpage - 1; i >= 0; --i) { + if (failflag || + incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || + (vnode_pager_addr(vp, m[i]->offset + paging_offset)) + != reqaddr + (i - reqpage) * PAGE_SIZE) { + vnode_pager_freepage(m[i]); + failflag = 1; + } else { + first = i; + } + } + + /* + * Scan forward and stop for the first non-contiguous + * entry or stop for a page being in buffer cache. + */ + failflag = 0; + last = reqpage + 1; + for (i = reqpage + 1; i < count; i++) { + if (failflag || + incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || + (vnode_pager_addr(vp, m[i]->offset + paging_offset)) + != reqaddr + (i - reqpage) * PAGE_SIZE) { + vnode_pager_freepage(m[i]); + failflag = 1; + } else { + last = i + 1; + } + } + splx(s); + + /* + * the first and last page have been calculated now, move input + * pages to be zero based... + */ + count = last; + if (first != 0) { + for (i = first; i < count; i++) { + m[i - first] = m[i]; + } + count -= first; + reqpage -= first; } - if (foff + PAGE_SIZE > vnp->vnp_size) + + /* + * calculate the file virtual address for the transfer + */ + foff = m[0]->offset + paging_offset; + /* + * and get the disk physical address (in bytes) + */ + firstaddr = vnode_pager_addr(vp, foff); + + /* + * calculate the size of the transfer + */ + size = count * PAGE_SIZE; + if ((foff + size) > vnp->vnp_size) size = vnp->vnp_size - foff; - else + + /* + * round up physical size for real devices + */ + if( dp->v_type == VBLK || dp->v_type == VCHR) + size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + /* + * and map the pages to be read into the kva + */ + for (i = 0; i < count; i++) + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + + pmap_update(); + bp = getpbuf(); + VHOLD(vp); + + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = vnode_pager_iodone; + /* B_PHYS is not set, but it is nice to fill this in */ + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = firstaddr / DEV_BSIZE; + bgetvp(dp, bp); + bp->b_bcount = size; + bp->b_bufsize = size; + + /* do the input */ + VOP_STRATEGY(bp); + + s = splbio(); + /* we definitely need to be at splbio here */ + + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnread", 0); + } + splx(s); + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + if (!error) { + if (size != count * PAGE_SIZE) + bzero((caddr_t)kva + size, PAGE_SIZE * count - size); + } + + pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); + kmem_free_wakeup(pager_map, kva, mapsize); + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + +finishup: + for (i = 0; i < count; i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + if (i != reqpage) { + /* + * whether or not to leave the page activated + * is up in the air, but we should put the page + * on a page queue somewhere. (it already is in + * the object). + * Result: It appears that emperical results show + * that deactivating pages is best. 
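The raw-device read just above follows the minimal fake-buffer recipe that recurs throughout this file: borrow a swap buffer header, point it at the mapped kva, issue VOP_STRATEGY(), and sleep until the B_CALL iodone handler marks the transfer done. Its skeleton, with credential and error handling left out of this sketch:

	bp = getpbuf();				/* borrow a swap buf header */
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = vnode_pager_iodone;	/* sets B_DONE, wakeup(bp) */
	bp->b_proc = curproc;
	bp->b_un.b_addr = (caddr_t)kva;		/* pages pmap_kenter'd here */
	bp->b_blkno = firstaddr / DEV_BSIZE;
	bp->b_bcount = bp->b_bufsize = size;
	bgetvp(dp, bp);				/* attach to the device vnode */
	VOP_STRATEGY(bp);
	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)	/* wait for the interrupt side */
		tsleep((caddr_t)bp, PVM, "vnread", 0);
	splx(s);
	relpbuf(bp);				/* header back to the pool */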
+ */ + /* + * just in case someone was asking for this + * page we now tell them that it is ok to use + */ + if (!error) { + vm_page_deactivate(m[i]); + PAGE_WAKEUP(m[i]); + m[i]->flags &= ~PG_FAKE; + m[i]->act_count = 2; + } else { + vnode_pager_freepage(m[i]); + } + } + } + if (error) { + printf("vnode pager read error: %d\n", error); + } + if (errtype) + return error; + return (error ? VM_PAGER_FAIL : VM_PAGER_OK); +} + +/* + * old-style vnode pager output routine + */ +int +vnode_pager_output_old(vnp, m) + register vn_pager_t vnp; + vm_page_t m; +{ + vm_offset_t foff; + vm_offset_t kva; + vm_offset_t size; + struct iovec aiov; + struct uio auio; + struct vnode *vp; + int error; + + vp = vnp->vnp_vp; + foff = m->offset + m->object->paging_offset; + /* + * Return failure if beyond current EOF + */ + if (foff >= vnp->vnp_size) { + return VM_PAGER_BAD; + } else { size = PAGE_SIZE; - aiov.iov_base = (caddr_t)kva; - aiov.iov_len = size; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = foff; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = rw; - auio.uio_resid = size; - auio.uio_procp = (struct proc *)0; -#ifdef DEBUG - if (vpagerdebug & VDB_IO) - printf("vnode_pager_io: vp %x kva %x foff %x size %x", - vnp->vnp_vp, kva, foff, size); -#endif - if (rw == UIO_READ) - error = VOP_READ(vnp->vnp_vp, &auio, 0, p->p_ucred); + if (foff + size > vnp->vnp_size) + size = vnp->vnp_size - foff; +/* + * Allocate a kernel virtual address and initialize so that + * we can use VOP_WRITE routines. + */ + kva = vm_pager_map_page(m); + aiov.iov_base = (caddr_t)kva; + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = foff; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_resid = size; + auio.uio_procp = (struct proc *)0; + + error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred); + + if (!error) { + if ((size - auio.uio_resid) == 0) { + error = EINVAL; + } + } + vm_pager_unmap_page(kva); + return error?VM_PAGER_FAIL:VM_PAGER_OK; + } +} + +/* + * vnode pager output on a small-block file system + */ +int +vnode_pager_output_smlfs(vnp, m) + vn_pager_t vnp; + vm_page_t m; +{ + int i; + int s; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + struct buf *bp; + vm_offset_t mapsize; + vm_offset_t foff; + vm_offset_t kva; + int fileaddr; + int block; + vm_offset_t bsize; + int run; + int error = 0; + + paging_offset = m->object->paging_offset; + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + foff = m->offset + paging_offset; + + VOP_BMAP(vp, foff, &dp, 0, 0); + kva = vm_pager_map_page(m); + for(i = 0; !error && i < (PAGE_SIZE/bsize); i++) { + /* + * calculate logical block and offset + */ + fileaddr = vnode_pager_addr(vp, foff + i * bsize); + if( fileaddr != -1) { + s = splbio(); + if( bp = incore( vp, (foff/bsize) + i)) { + bp = getblk(vp, (foff/bsize) + i, bp->b_bufsize,0, 0); + bp->b_flags |= B_INVAL; + brelse(bp); + } + splx(s); + + bp = getpbuf(); + VHOLD(vp); + + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_CALL | B_WRITE; + bp->b_iodone = vnode_pager_iodone; + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva + i * bsize; + bp->b_blkno = fileaddr / DEV_BSIZE; + bgetvp(dp, bp); + ++dp->v_numoutput; + /* for NFS */ + bp->b_dirtyoff = 0; + bp->b_dirtyend = bsize; + bp->b_bcount = bsize; + bp->b_bufsize = bsize; + + /* do the input */ + 
VOP_STRATEGY(bp); + + /* we definitely need to be at splbio here */ + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnswrt", 0); + } + splx(s); + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + } + } + vm_pager_unmap_page(kva); + if( error) + return VM_PAGER_FAIL; else - error = VOP_WRITE(vnp->vnp_vp, &auio, 0, p->p_ucred); - VOP_UNLOCK(vnp->vnp_vp); -#ifdef DEBUG - if (vpagerdebug & VDB_IO) { - if (error || auio.uio_resid) - printf(" returns error %x, resid %x", - error, auio.uio_resid); - printf("\n"); + return VM_PAGER_OK; +} + +/* + * generic vnode pager output routine + */ +int +vnode_pager_output(vnp, m, count, rtvals) + vn_pager_t vnp; + vm_page_t *m; + int count; + int *rtvals; +{ + int i,j; + vm_offset_t kva, foff; + int size; + struct proc *p = curproc; /* XXX */ + vm_object_t object; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + struct buf *bp; + vm_offset_t mapsize; + vm_offset_t reqaddr; + int run; + int bsize; + int s; + + int error = 0; + +retryoutput: + object = m[0]->object; /* all vm_page_t items are in same object */ + paging_offset = object->paging_offset; + + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + + for(i=0;i<count;i++) + rtvals[i] = VM_PAGER_AGAIN; + + /* + * if the filesystem does not have a bmap, then use the + * old code + */ + if (VOP_BMAP(vp, m[0]->offset+paging_offset, &dp, 0, 0)) { + + rtvals[0] = vnode_pager_output_old(vnp, m[0]); + + pmap_clear_modify(VM_PAGE_TO_PHYS(m[0])); + m[0]->flags |= PG_CLEAN; + m[0]->flags &= ~PG_LAUNDRY; + return rtvals[0]; } -#endif - if (!error) { - register int count = size - auio.uio_resid; - if (count == 0) - error = EINVAL; - else if (count != PAGE_SIZE && rw == UIO_READ) - bzero((void *)(kva + count), PAGE_SIZE - count); + /* + * if the filesystem has a small blocksize, then use + * the small block filesystem output code + */ + if ((bsize < PAGE_SIZE) && + (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { + + for(i=0;i<count;i++) { + rtvals[i] = vnode_pager_output_smlfs(vnp, m[i]); + if( rtvals[i] == VM_PAGER_OK) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + } + } + return rtvals[0]; } - vm_pager_unmap_pages(kva, npages); - return (error ? VM_PAGER_ERROR : VM_PAGER_OK); + + /* + * get some kva for the output + */ + kva = kmem_alloc_pageable(pager_map, (mapsize = count*PAGE_SIZE)); + if( !kva) { + kva = kmem_alloc_pageable(pager_map, (mapsize = PAGE_SIZE)); + count = 1; + if( !kva) + return rtvals[0]; + } + + for(i=0;i<count;i++) { + foff = m[i]->offset + paging_offset; + if (foff >= vnp->vnp_size) { + for(j=i;j<count;j++) + rtvals[j] = VM_PAGER_BAD; + count = i; + break; + } + } + if (count == 0) { + return rtvals[0]; + } + foff = m[0]->offset + paging_offset; + reqaddr = vnode_pager_addr(vp, foff); + /* + * Scan forward and stop for the first non-contiguous + * entry or stop for a page being in buffer cache. 
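Before issuing any I/O, the output path above defaults every status slot to VM_PAGER_AGAIN and trims the request at end of file, marking the trimmed tail VM_PAGER_BAD. In sketch form:

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;	/* default: retry later */

	for (i = 0; i < count; i++) {
		foff = m[i]->offset + paging_offset;
		if (foff >= vnp->vnp_size) {	/* page lies past EOF */
			for (j = i; j < count; j++)
				rtvals[j] = VM_PAGER_BAD;
			count = i;		/* write only the prefix */
			break;
		}
	}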
+ */ + for (i = 1; i < count; i++) { + if ( vnode_pager_addr(vp, m[i]->offset + paging_offset) + != reqaddr + i * PAGE_SIZE) { + count = i; + break; + } + } + + /* + * calculate the size of the transfer + */ + size = count * PAGE_SIZE; + if ((foff + size) > vnp->vnp_size) + size = vnp->vnp_size - foff; + + /* + * round up physical size for real devices + */ + if( dp->v_type == VBLK || dp->v_type == VCHR) + size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + /* + * and map the pages to be read into the kva + */ + for (i = 0; i < count; i++) + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + pmap_update(); +/* + printf("vnode: writing foff: %d, devoff: %d, size: %d\n", + foff, reqaddr, size); +*/ + /* + * next invalidate the incore vfs_bio data + */ + for (i = 0; i < count; i++) { + int filblock = (foff + i * PAGE_SIZE) / bsize; + struct buf *fbp; + + s = splbio(); + if( fbp = incore( vp, filblock)) { + /* printf("invalidating: %d\n", filblock); */ + fbp = getblk(vp, filblock, fbp->b_bufsize,0,0); + fbp->b_flags |= B_INVAL; + brelse(fbp); + } + splx(s); + } + + + bp = getpbuf(); + VHOLD(vp); + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_WRITE | B_CALL; + bp->b_iodone = vnode_pager_iodone; + /* B_PHYS is not set, but it is nice to fill this in */ + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = reqaddr / DEV_BSIZE; + bgetvp(dp, bp); + ++dp->v_numoutput; + + /* for NFS */ + bp->b_dirtyoff = 0; + bp->b_dirtyend = size; + + bp->b_bcount = size; + bp->b_bufsize = size; + + /* do the output */ + VOP_STRATEGY(bp); + + s = splbio(); + + /* we definitely need to be at splbio here */ + + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnwrite", 0); + } + splx(s); + + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); + kmem_free_wakeup(pager_map, kva, mapsize); + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + + if( !error) { + for(i=0;i<count;i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + rtvals[i] = VM_PAGER_OK; + } + } else if( count != 1) { + error = 0; + count = 1; + goto retryoutput; + } + + if (error) { + printf("vnode pager write error: %d\n", error); + } + return (error ? VM_PAGER_FAIL : VM_PAGER_OK); } + diff --git a/sys/vm/vnode_pager.h b/sys/vm/vnode_pager.h index 95c9545..b01dc54 100644 --- a/sys/vm/vnode_pager.h +++ b/sys/vm/vnode_pager.h @@ -53,7 +53,4 @@ typedef struct vnpager *vn_pager_t; #define VN_PAGER_NULL ((vn_pager_t)0) -#define VNP_PAGING 0x01 /* vnode used for pageout */ -#define VNP_CACHED 0x02 /* vnode is cached */ - #endif /* _VNODE_PAGER_ */ |
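Finally, a hypothetical pageout-side consumer of the per-page status contract that vm_pager_put_pages() and vnode_pager_putmulti() now return through rtvals; the flag handling mirrors what the pager itself does on success, and failed pages are reactivated so they cannot clog the inactive queue:

	(void) vm_pager_put_pages(pager, marray, count, FALSE, rtvals);
	for (i = 0; i < count; i++) {
		if (rtvals[i] == VM_PAGER_OK) {
			marray[i]->flags |= PG_CLEAN;	/* made it to disk */
			marray[i]->flags &= ~PG_LAUNDRY;
		} else
			vm_page_activate(marray[i]);	/* failed: reactivate */
	}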