Diffstat (limited to 'sys/vm')
34 files changed, 17060 insertions, 0 deletions
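Both pagers introduced below (device_pager.c and swap_pager.c) hook into the VM system the same way: each fills in a struct pagerops table of function pointers (devicepagerops, swappagerops), and the rest of the kernel calls through that table via pg_ops instead of naming a specific pager. The stand-alone sketch that follows shows only the pattern; the struct name, field names, and functions in it are illustrative, not the kernel's.

#include <stdio.h>

/* Illustrative ops table, analogous in shape to the pagerops
 * initializers in device_pager.c and swap_pager.c below.
 * All names here are made up for the example. */
struct demo_pagerops {
	void (*po_init)(void);
	int  (*po_getpage)(int pageno);
	int  (*po_putpage)(int pageno);
};

static void demo_init(void)          { printf("demo pager: init\n"); }
static int  demo_getpage(int pageno) { printf("get page %d\n", pageno); return 0; }
static int  demo_putpage(int pageno) { printf("put page %d\n", pageno); return 0; }

/* One table per pager type, selected at run time through a pointer,
 * much as pg_ops selects devicepagerops or swappagerops. */
static const struct demo_pagerops demo_ops = {
	demo_init,
	demo_getpage,
	demo_putpage,
};

int main(void)
{
	const struct demo_pagerops *ops = &demo_ops;

	ops->po_init();
	ops->po_getpage(3);
	return ops->po_putpage(3);
}

In the diff itself the table also carries entries for dealloc, multi-page get/put, and haspage, and slots a pager does not implement are simply left 0, as in devicepagerops.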
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c new file mode 100644 index 0000000..b8083df --- /dev/null +++ b/sys/vm/device_pager.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 + */ + +/* + * Page to/from special files. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/mman.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/device_pager.h> + +struct pagerlst dev_pager_list; /* list of managed devices */ +struct pglist dev_pager_fakelist; /* list of available vm_page_t's */ + +#ifdef DEBUG +int dpagerdebug = 0; +#define DDB_FOLLOW 0x01 +#define DDB_INIT 0x02 +#define DDB_ALLOC 0x04 +#define DDB_FAIL 0x08 +#endif + +static vm_pager_t dev_pager_alloc + __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); +static void dev_pager_dealloc __P((vm_pager_t)); +static int dev_pager_getpage + __P((vm_pager_t, vm_page_t, boolean_t)); +static boolean_t dev_pager_haspage __P((vm_pager_t, vm_offset_t)); +static void dev_pager_init __P((void)); +static int dev_pager_putpage + __P((vm_pager_t, vm_page_t, boolean_t)); +static vm_page_t dev_pager_getfake __P((vm_offset_t)); +static void dev_pager_putfake __P((vm_page_t)); + +struct pagerops devicepagerops = { + dev_pager_init, + dev_pager_alloc, + dev_pager_dealloc, + dev_pager_getpage, + 0, + dev_pager_putpage, + 0, + dev_pager_haspage +}; + +static void +dev_pager_init() +{ +#ifdef DEBUG + if (dpagerdebug & DDB_FOLLOW) + printf("dev_pager_init()\n"); +#endif + TAILQ_INIT(&dev_pager_list); + TAILQ_INIT(&dev_pager_fakelist); +} + +static vm_pager_t +dev_pager_alloc(handle, size, prot, foff) + caddr_t handle; + vm_size_t size; + vm_prot_t prot; + vm_offset_t foff; +{ + dev_t dev; + vm_pager_t pager; + int (*mapfunc)(); + vm_object_t object; + dev_pager_t devp; + unsigned int npages, off; + +#ifdef DEBUG + if (dpagerdebug & DDB_FOLLOW) + printf("dev_pager_alloc(%x, %x, %x, %x)\n", + handle, size, prot, foff); +#endif +#ifdef DIAGNOSTIC + /* + * Pageout to device, should never happen. + */ + if (handle == NULL) + panic("dev_pager_alloc called"); +#endif + + /* + * Make sure this device can be mapped. + */ + dev = (dev_t)(u_long)handle; + mapfunc = cdevsw[major(dev)].d_mmap; + if (mapfunc == NULL || mapfunc == enodev || mapfunc == nullop) + return(NULL); + + /* + * Offset should be page aligned. + */ + if (foff & (PAGE_SIZE-1)) + return(NULL); + + /* + * Check that the specified range of the device allows the + * desired protection. + * + * XXX assumes VM_PROT_* == PROT_* + */ + npages = atop(round_page(size)); + for (off = foff; npages--; off += PAGE_SIZE) + if ((*mapfunc)(dev, off, (int)prot) == -1) + return(NULL); + + /* + * Look up pager, creating as necessary. + */ +top: + pager = vm_pager_lookup(&dev_pager_list, handle); + if (pager == NULL) { + /* + * Allocate and initialize pager structs + */ + pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, M_WAITOK); + if (pager == NULL) + return(NULL); + devp = (dev_pager_t)malloc(sizeof *devp, M_VMPGDATA, M_WAITOK); + if (devp == NULL) { + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + pager->pg_handle = handle; + pager->pg_ops = &devicepagerops; + pager->pg_type = PG_DEVICE; + pager->pg_data = (caddr_t)devp; + pager->pg_flags = 0; + TAILQ_INIT(&devp->devp_pglist); + /* + * Allocate object and associate it with the pager. + */ + object = devp->devp_object = vm_object_allocate(0); + vm_object_enter(object, pager); + vm_object_setpager(object, pager, (vm_offset_t)foff, FALSE); + /* + * Finally, put it on the managed list so other can find it. + * First we re-lookup in case someone else beat us to this + * point (due to blocking in the various mallocs). If so, + * we free everything and start over. 
+ */ + if (vm_pager_lookup(&dev_pager_list, handle)) { + free((caddr_t)devp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); + goto top; + } + TAILQ_INSERT_TAIL(&dev_pager_list, pager, pg_list); +#ifdef DEBUG + if (dpagerdebug & DDB_ALLOC) { + printf("dev_pager_alloc: pager %x devp %x object %x\n", + pager, devp, object); + vm_object_print(object, FALSE); + } +#endif + } else { + /* + * vm_object_lookup() gains a reference and also + * removes the object from the cache. + */ + object = vm_object_lookup(pager); +#ifdef DIAGNOSTIC + devp = (dev_pager_t)pager->pg_data; + if (object != devp->devp_object) + panic("dev_pager_setup: bad object"); +#endif + } + return(pager); +} + +static void +dev_pager_dealloc(pager) + vm_pager_t pager; +{ + dev_pager_t devp; + vm_object_t object; + vm_page_t m; + +#ifdef DEBUG + if (dpagerdebug & DDB_FOLLOW) + printf("dev_pager_dealloc(%x)\n", pager); +#endif + TAILQ_REMOVE(&dev_pager_list, pager, pg_list); + /* + * Get the object. + * Note: cannot use vm_object_lookup since object has already + * been removed from the hash chain. + */ + devp = (dev_pager_t)pager->pg_data; + object = devp->devp_object; +#ifdef DEBUG + if (dpagerdebug & DDB_ALLOC) + printf("dev_pager_dealloc: devp %x object %x\n", devp, object); +#endif + /* + * Free up our fake pages. + */ + while (m=devp->devp_pglist.tqh_first) { + TAILQ_REMOVE(&devp->devp_pglist, m, pageq); + dev_pager_putfake(m); + } + free((caddr_t)devp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); +} + +static int +dev_pager_getpage(pager, m, sync) + vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ + register vm_object_t object; + vm_offset_t offset, paddr; + vm_page_t page; + dev_t dev; + int s; + int (*mapfunc)(), prot; + +#ifdef DEBUG + if (dpagerdebug & DDB_FOLLOW) + printf("dev_pager_getpage(%x, %x)\n", pager, m); +#endif + + object = m->object; + dev = (dev_t)(u_long)pager->pg_handle; + offset = m->offset + object->paging_offset; + prot = PROT_READ; /* XXX should pass in? */ + mapfunc = cdevsw[major(dev)].d_mmap; + + if (mapfunc == NULL || mapfunc == enodev || mapfunc == nullop) + panic("dev_pager_getpage: no map function"); + + paddr = pmap_phys_address((*mapfunc)((dev_t)dev, (int)offset, prot)); +#ifdef DIAGNOSTIC + if (paddr == -1) + panic("dev_pager_getpage: map function returns error"); +#endif + /* + * Replace the passed in page with our own fake page and free + * up the original. 
+ */ + page = dev_pager_getfake(paddr); + TAILQ_INSERT_TAIL(&((dev_pager_t)pager->pg_data)->devp_pglist, + page, pageq); + vm_object_lock(object); + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + s = splhigh(); + vm_page_insert(page, object, offset); + splx(s); + PAGE_WAKEUP(m); + if (offset + PAGE_SIZE > object->size) + object->size = offset + PAGE_SIZE; /* XXX anal */ + vm_object_unlock(object); + + return(VM_PAGER_OK); +} + +static int +dev_pager_putpage(pager, m, sync) + vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ +#ifdef DEBUG + if (dpagerdebug & DDB_FOLLOW) + printf("dev_pager_putpage(%x, %x)\n", pager, m); +#endif + if (pager == NULL) + return 0; + panic("dev_pager_putpage called"); +} + +static boolean_t +dev_pager_haspage(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ +#ifdef DEBUG + if (dpagerdebug & DDB_FOLLOW) + printf("dev_pager_haspage(%x, %x)\n", pager, offset); +#endif + return(TRUE); +} + +static vm_page_t +dev_pager_getfake(paddr) + vm_offset_t paddr; +{ + vm_page_t m; + int i; + + if (dev_pager_fakelist.tqh_first == NULL) { + m = (vm_page_t)malloc(PAGE_SIZE, M_VMPGDATA, M_WAITOK); + for (i = PAGE_SIZE / sizeof(*m); i > 0; i--) { + TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq); + m++; + } + } + m = dev_pager_fakelist.tqh_first; + TAILQ_REMOVE(&dev_pager_fakelist, m, pageq); + + m->flags = PG_BUSY | PG_CLEAN | PG_FAKE | PG_FICTITIOUS; + + m->wire_count = 1; + m->phys_addr = paddr; + + return(m); +} + +static void +dev_pager_putfake(m) + vm_page_t m; +{ +#ifdef DIAGNOSTIC + if (!(m->flags & PG_FICTITIOUS)) + panic("dev_pager_putfake: bad page"); +#endif + TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq); +} diff --git a/sys/vm/device_pager.h b/sys/vm/device_pager.h new file mode 100644 index 0000000..88406229 --- /dev/null +++ b/sys/vm/device_pager.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)device_pager.h 8.3 (Berkeley) 12/13/93 + */ + +#ifndef _DEVICE_PAGER_ +#define _DEVICE_PAGER_ 1 + +/* + * Device pager private data. + */ +struct devpager { + struct pglist devp_pglist; /* list of pages allocated */ + vm_object_t devp_object; /* object representing this device */ +}; +typedef struct devpager *dev_pager_t; + +#endif /* _DEVICE_PAGER_ */ diff --git a/sys/vm/kern_lock.c b/sys/vm/kern_lock.c new file mode 100644 index 0000000..c4fa052 --- /dev/null +++ b/sys/vm/kern_lock.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_lock.c 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Locking primitives implementation + */ + +#include <sys/param.h> +#include <sys/systm.h> + +#include <vm/vm.h> + +/* XXX */ +#include <sys/proc.h> +typedef int *thread_t; +#define current_thread() ((thread_t)&curproc->p_thread) +/* XXX */ + +#if NCPUS > 1 + +/* + * Module: lock + * Function: + * Provide reader/writer sychronization. + * Implementation: + * Simple interlock on a bit. Readers first interlock + * increment the reader count, then let go. Writers hold + * the interlock (thus preventing further readers), and + * wait for already-accepted readers to go away. + */ + +/* + * The simple-lock routines are the primitives out of which + * the lock package is built. The implementation is left + * to the machine-dependent code. + */ + +#ifdef notdef +/* + * A sample implementation of simple locks. + * assumes: + * boolean_t test_and_set(boolean_t *) + * indivisibly sets the boolean to TRUE + * and returns its old value + * and that setting a boolean to FALSE is indivisible. + */ +/* + * simple_lock_init initializes a simple lock. A simple lock + * may only be used for exclusive locks. + */ + +void simple_lock_init(l) + simple_lock_t l; +{ + *(boolean_t *)l = FALSE; +} + +void simple_lock(l) + simple_lock_t l; +{ + while (test_and_set((boolean_t *)l)) + continue; +} + +void simple_unlock(l) + simple_lock_t l; +{ + *(boolean_t *)l = FALSE; +} + +boolean_t simple_lock_try(l) + simple_lock_t l; +{ + return (!test_and_set((boolean_t *)l)); +} +#endif /* notdef */ +#endif /* NCPUS > 1 */ + +#if NCPUS > 1 +int lock_wait_time = 100; +#else /* NCPUS > 1 */ + + /* + * It is silly to spin on a uni-processor as if we + * thought something magical would happen to the + * want_write bit while we are executing. + */ +int lock_wait_time = 0; +#endif /* NCPUS > 1 */ + + +/* + * Routine: lock_init + * Function: + * Initialize a lock; required before use. + * Note that clients declare the "struct lock" + * variables and then initialize them, rather + * than getting a new one from this module. + */ +void lock_init(l, can_sleep) + lock_t l; + boolean_t can_sleep; +{ + bzero(l, sizeof(lock_data_t)); + simple_lock_init(&l->interlock); + l->want_write = FALSE; + l->want_upgrade = FALSE; + l->read_count = 0; + l->can_sleep = can_sleep; + l->thread = (char *)-1; /* XXX */ + l->recursion_depth = 0; +} + +void lock_sleepable(l, can_sleep) + lock_t l; + boolean_t can_sleep; +{ + simple_lock(&l->interlock); + l->can_sleep = can_sleep; + simple_unlock(&l->interlock); +} + + +/* + * Sleep locks. 
These use the same data structure and algorithm + * as the spin locks, but the process sleeps while it is waiting + * for the lock. These work on uniprocessor systems. + */ + +void lock_write(l) + register lock_t l; +{ + register int i; + + simple_lock(&l->interlock); + + if (((thread_t)l->thread) == current_thread()) { + /* + * Recursive lock. + */ + l->recursion_depth++; + simple_unlock(&l->interlock); + return; + } + + /* + * Try to acquire the want_write bit. + */ + while (l->want_write) { + if ((i = lock_wait_time) > 0) { + simple_unlock(&l->interlock); + while (--i > 0 && l->want_write) + continue; + simple_lock(&l->interlock); + } + + if (l->can_sleep && l->want_write) { + l->waiting = TRUE; + thread_sleep((int) l, &l->interlock, FALSE); + simple_lock(&l->interlock); + } + } + l->want_write = TRUE; + + /* Wait for readers (and upgrades) to finish */ + + while ((l->read_count != 0) || l->want_upgrade) { + if ((i = lock_wait_time) > 0) { + simple_unlock(&l->interlock); + while (--i > 0 && (l->read_count != 0 || + l->want_upgrade)) + continue; + simple_lock(&l->interlock); + } + + if (l->can_sleep && (l->read_count != 0 || l->want_upgrade)) { + l->waiting = TRUE; + thread_sleep((int) l, &l->interlock, FALSE); + simple_lock(&l->interlock); + } + } + simple_unlock(&l->interlock); +} + +void lock_done(l) + register lock_t l; +{ + simple_lock(&l->interlock); + + if (l->read_count != 0) + l->read_count--; + else + if (l->recursion_depth != 0) + l->recursion_depth--; + else + if (l->want_upgrade) + l->want_upgrade = FALSE; + else + l->want_write = FALSE; + + if (l->waiting) { + l->waiting = FALSE; + thread_wakeup((int) l); + } + simple_unlock(&l->interlock); +} + +void lock_read(l) + register lock_t l; +{ + register int i; + + simple_lock(&l->interlock); + + if (((thread_t)l->thread) == current_thread()) { + /* + * Recursive lock. + */ + l->read_count++; + simple_unlock(&l->interlock); + return; + } + + while (l->want_write || l->want_upgrade) { + if ((i = lock_wait_time) > 0) { + simple_unlock(&l->interlock); + while (--i > 0 && (l->want_write || l->want_upgrade)) + continue; + simple_lock(&l->interlock); + } + + if (l->can_sleep && (l->want_write || l->want_upgrade)) { + l->waiting = TRUE; + thread_sleep((int) l, &l->interlock, FALSE); + simple_lock(&l->interlock); + } + } + + l->read_count++; + simple_unlock(&l->interlock); +} + +/* + * Routine: lock_read_to_write + * Function: + * Improves a read-only lock to one with + * write permission. If another reader has + * already requested an upgrade to a write lock, + * no lock is held upon return. + * + * Returns TRUE if the upgrade *failed*. + */ +boolean_t lock_read_to_write(l) + register lock_t l; +{ + register int i; + + simple_lock(&l->interlock); + + l->read_count--; + + if (((thread_t)l->thread) == current_thread()) { + /* + * Recursive lock. + */ + l->recursion_depth++; + simple_unlock(&l->interlock); + return(FALSE); + } + + if (l->want_upgrade) { + /* + * Someone else has requested upgrade. + * Since we've released a read lock, wake + * him up. 
+ */ + if (l->waiting) { + l->waiting = FALSE; + thread_wakeup((int) l); + } + + simple_unlock(&l->interlock); + return (TRUE); + } + + l->want_upgrade = TRUE; + + while (l->read_count != 0) { + if ((i = lock_wait_time) > 0) { + simple_unlock(&l->interlock); + while (--i > 0 && l->read_count != 0) + continue; + simple_lock(&l->interlock); + } + + if (l->can_sleep && l->read_count != 0) { + l->waiting = TRUE; + thread_sleep((int) l, &l->interlock, FALSE); + simple_lock(&l->interlock); + } + } + + simple_unlock(&l->interlock); + return (FALSE); +} + +void lock_write_to_read(l) + register lock_t l; +{ + simple_lock(&l->interlock); + + l->read_count++; + if (l->recursion_depth != 0) + l->recursion_depth--; + else + if (l->want_upgrade) + l->want_upgrade = FALSE; + else + l->want_write = FALSE; + + if (l->waiting) { + l->waiting = FALSE; + thread_wakeup((int) l); + } + + simple_unlock(&l->interlock); +} + + +/* + * Routine: lock_try_write + * Function: + * Tries to get a write lock. + * + * Returns FALSE if the lock is not held on return. + */ + +boolean_t lock_try_write(l) + register lock_t l; +{ + + simple_lock(&l->interlock); + + if (((thread_t)l->thread) == current_thread()) { + /* + * Recursive lock + */ + l->recursion_depth++; + simple_unlock(&l->interlock); + return(TRUE); + } + + if (l->want_write || l->want_upgrade || l->read_count) { + /* + * Can't get lock. + */ + simple_unlock(&l->interlock); + return(FALSE); + } + + /* + * Have lock. + */ + + l->want_write = TRUE; + simple_unlock(&l->interlock); + return(TRUE); +} + +/* + * Routine: lock_try_read + * Function: + * Tries to get a read lock. + * + * Returns FALSE if the lock is not held on return. + */ + +boolean_t lock_try_read(l) + register lock_t l; +{ + simple_lock(&l->interlock); + + if (((thread_t)l->thread) == current_thread()) { + /* + * Recursive lock + */ + l->read_count++; + simple_unlock(&l->interlock); + return(TRUE); + } + + if (l->want_write || l->want_upgrade) { + simple_unlock(&l->interlock); + return(FALSE); + } + + l->read_count++; + simple_unlock(&l->interlock); + return(TRUE); +} + +/* + * Routine: lock_try_read_to_write + * Function: + * Improves a read-only lock to one with + * write permission. If another reader has + * already requested an upgrade to a write lock, + * the read lock is still held upon return. + * + * Returns FALSE if the upgrade *failed*. + */ +boolean_t lock_try_read_to_write(l) + register lock_t l; +{ + + simple_lock(&l->interlock); + + if (((thread_t)l->thread) == current_thread()) { + /* + * Recursive lock + */ + l->read_count--; + l->recursion_depth++; + simple_unlock(&l->interlock); + return(TRUE); + } + + if (l->want_upgrade) { + simple_unlock(&l->interlock); + return(FALSE); + } + l->want_upgrade = TRUE; + l->read_count--; + + while (l->read_count != 0) { + l->waiting = TRUE; + thread_sleep((int) l, &l->interlock, FALSE); + simple_lock(&l->interlock); + } + + simple_unlock(&l->interlock); + return(TRUE); +} + +/* + * Allow a process that has a lock for write to acquire it + * recursively (for read, write, or update). + */ +void lock_set_recursive(l) + lock_t l; +{ + simple_lock(&l->interlock); + if (!l->want_write) { + panic("lock_set_recursive: don't have write lock"); + } + l->thread = (char *) current_thread(); + simple_unlock(&l->interlock); +} + +/* + * Prevent a lock from being re-acquired. 
+ */ +void lock_clear_recursive(l) + lock_t l; +{ + simple_lock(&l->interlock); + if (((thread_t) l->thread) != current_thread()) { + panic("lock_clear_recursive: wrong thread"); + } + if (l->recursion_depth == 0) + l->thread = (char *)-1; /* XXX */ + simple_unlock(&l->interlock); +} diff --git a/sys/vm/lock.h b/sys/vm/lock.h new file mode 100644 index 0000000..26bed1f --- /dev/null +++ b/sys/vm/lock.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lock.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * Locking primitives definitions + */ + +#ifndef _LOCK_H_ +#define _LOCK_H_ + +#define NCPUS 1 /* XXX */ + +/* + * A simple spin lock. + */ + +struct slock { + int lock_data; /* in general 1 bit is sufficient */ +}; + +typedef struct slock simple_lock_data_t; +typedef struct slock *simple_lock_t; + +/* + * The general lock structure. Provides for multiple readers, + * upgrading from read to write, and sleeping until the lock + * can be gained. + */ + +struct lock { +#ifdef vax + /* + * Efficient VAX implementation -- see field description below. + */ + unsigned int read_count:16, + want_upgrade:1, + want_write:1, + waiting:1, + can_sleep:1, + :0; + + simple_lock_data_t interlock; +#else /* vax */ +#ifdef ns32000 + /* + * Efficient ns32000 implementation -- + * see field description below. + */ + simple_lock_data_t interlock; + unsigned int read_count:16, + want_upgrade:1, + want_write:1, + waiting:1, + can_sleep:1, + :0; + +#else /* ns32000 */ + /* Only the "interlock" field is used for hardware exclusion; + * other fields are modified with normal instructions after + * acquiring the interlock bit. + */ + simple_lock_data_t + interlock; /* Interlock for remaining fields */ + boolean_t want_write; /* Writer is waiting, or locked for write */ + boolean_t want_upgrade; /* Read-to-write upgrade waiting */ + boolean_t waiting; /* Someone is sleeping on lock */ + boolean_t can_sleep; /* Can attempts to lock go to sleep */ + int read_count; /* Number of accepted readers */ +#endif /* ns32000 */ +#endif /* vax */ + char *thread; /* Thread that has lock, if recursive locking allowed */ + /* (should be thread_t, but but we then have mutually + recursive definitions) */ + int recursion_depth;/* Depth of recursion */ +}; + +typedef struct lock lock_data_t; +typedef struct lock *lock_t; + +#if NCPUS > 1 +__BEGIN_DECLS +void simple_lock __P((simple_lock_t)); +void simple_lock_init __P((simple_lock_t)); +boolean_t simple_lock_try __P((simple_lock_t)); +void simple_unlock __P((simple_lock_t)); +__END_DECLS +#else /* No multiprocessor locking is necessary. */ +#define simple_lock(l) +#define simple_lock_init(l) +#define simple_lock_try(l) (1) /* Always succeeds. */ +#define simple_unlock(l) +#endif + +/* Sleep locks must work even if no multiprocessing. */ + +#define lock_read_done(l) lock_done(l) +#define lock_write_done(l) lock_done(l) + +void lock_clear_recursive __P((lock_t)); +void lock_done __P((lock_t)); +void lock_init __P((lock_t, boolean_t)); +void lock_read __P((lock_t)); +boolean_t lock_read_to_write __P((lock_t)); +void lock_set_recursive __P((lock_t)); +void lock_sleepable __P((lock_t, boolean_t)); +boolean_t lock_try_read __P((lock_t)); +boolean_t lock_try_read_to_write __P((lock_t)); +boolean_t lock_try_write __P((lock_t)); +void lock_write __P((lock_t)); +void lock_write_to_read __P((lock_t)); +#endif /* !_LOCK_H_ */ diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h new file mode 100644 index 0000000..63a83c9 --- /dev/null +++ b/sys/vm/pmap.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pmap.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Avadis Tevanian, Jr. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Machine address mapping definitions -- machine-independent + * section. [For machine-dependent section, see "machine/pmap.h".] + */ + +#ifndef _PMAP_VM_ +#define _PMAP_VM_ + +/* + * Each machine dependent implementation is expected to + * keep certain statistics. They may do this anyway they + * so choose, but are expected to return the statistics + * in the following structure. 
+ */ +struct pmap_statistics { + long resident_count; /* # of pages mapped (total)*/ + long wired_count; /* # of pages wired */ +}; +typedef struct pmap_statistics *pmap_statistics_t; + +#include <machine/pmap.h> + +#ifdef KERNEL +__BEGIN_DECLS +void *pmap_bootstrap_alloc __P((int)); +void pmap_bootstrap( /* machine dependent */ ); +void pmap_change_wiring __P((pmap_t, vm_offset_t, boolean_t)); +void pmap_clear_modify __P((vm_offset_t pa)); +void pmap_clear_reference __P((vm_offset_t pa)); +void pmap_collect __P((pmap_t)); +void pmap_copy __P((pmap_t, + pmap_t, vm_offset_t, vm_size_t, vm_offset_t)); +void pmap_copy_page __P((vm_offset_t, vm_offset_t)); +pmap_t pmap_create __P((vm_size_t)); +void pmap_destroy __P((pmap_t)); +void pmap_enter __P((pmap_t, + vm_offset_t, vm_offset_t, vm_prot_t, boolean_t)); +vm_offset_t pmap_extract __P((pmap_t, vm_offset_t)); +void pmap_init __P((vm_offset_t, vm_offset_t)); +boolean_t pmap_is_modified __P((vm_offset_t pa)); +boolean_t pmap_is_referenced __P((vm_offset_t pa)); +vm_offset_t pmap_map __P((vm_offset_t, vm_offset_t, vm_offset_t, int)); +void pmap_page_protect __P((vm_offset_t, vm_prot_t)); +void pmap_pageable __P((pmap_t, + vm_offset_t, vm_offset_t, boolean_t)); +vm_offset_t pmap_phys_address __P((int)); +void pmap_pinit __P((pmap_t)); +void pmap_protect __P((pmap_t, + vm_offset_t, vm_offset_t, vm_prot_t)); +void pmap_reference __P((pmap_t)); +void pmap_release __P((pmap_t)); +void pmap_remove __P((pmap_t, vm_offset_t, vm_offset_t)); +void pmap_update __P((void)); +void pmap_zero_page __P((vm_offset_t)); +__END_DECLS +#endif + +#endif /* _PMAP_VM_ */ diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c new file mode 100644 index 0000000..a534d42 --- /dev/null +++ b/sys/vm/swap_pager.c @@ -0,0 +1,1833 @@ +/* + * Copyright (c) 1994 John S. Dyson + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ + * + * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 + */ + +/* + * Quick hack to page to dedicated partition(s). + * TODO: + * Add multiprocessor locks + * Deal with async writes in a better fashion + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + +#include <miscfs/specfs/specdev.h> +#include <sys/rlist.h> + +#include <vm/vm.h> +#include <vm/vm_pager.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/swap_pager.h> + +#ifndef NPENDINGIO +#define NPENDINGIO 16 +#endif + +extern int nswbuf; +int nswiodone; +extern int vm_pageout_rate_limit; +static int cleandone; +extern int hz; +int swap_pager_full; +extern vm_map_t pager_map; +extern int vm_pageout_pages_needed; +extern int vm_swap_size; +extern struct vnode *swapdev_vp; + +#define MAX_PAGEOUT_CLUSTER 8 + +TAILQ_HEAD(swpclean, swpagerclean); + +typedef struct swpagerclean *swp_clean_t; + +struct swpagerclean { + TAILQ_ENTRY(swpagerclean) spc_list; + int spc_flags; + struct buf *spc_bp; + sw_pager_t spc_swp; + vm_offset_t spc_kva; + vm_offset_t spc_altkva; + int spc_count; + vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; +} swcleanlist [NPENDINGIO] ; + + +extern vm_map_t kernel_map; + +/* spc_flags values */ +#define SPC_ERROR 0x01 + +#define SWB_EMPTY (-1) + +void swap_pager_init(void); +vm_pager_t swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t); +void swap_pager_dealloc(vm_pager_t); +boolean_t swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t); +boolean_t swap_pager_haspage(vm_pager_t, vm_offset_t); +int swap_pager_io(sw_pager_t, vm_page_t *, int, int, int); +void swap_pager_iodone(struct buf *); +boolean_t swap_pager_clean(); + +extern struct pagerops swappagerops; + +struct swpclean swap_pager_done; /* list of compileted page cleans */ +struct swpclean swap_pager_inuse; /* list of pending page cleans */ +struct swpclean swap_pager_free; /* list of free pager clean structs */ +struct pagerlst swap_pager_list; /* list of "named" anon regions */ +struct pagerlst swap_pager_un_list; /* list of "unnamed" anon pagers */ + +#define SWAP_FREE_NEEDED 0x1 /* need a swap block */ +int swap_pager_needflags; +struct rlist *swapfrag; + +struct pagerlst *swp_qs[]={ + &swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0 +}; + +int swap_pager_putmulti(); + +struct pagerops swappagerops = { + swap_pager_init, + swap_pager_alloc, + swap_pager_dealloc, + swap_pager_getpage, + swap_pager_getmulti, + swap_pager_putpage, + swap_pager_putmulti, + swap_pager_haspage +}; + +extern int nswbuf; + +int npendingio = NPENDINGIO; +int pendingiowait; +int require_swap_init; +void swap_pager_finish(); +int dmmin, dmmax; +extern int vm_page_count; + +struct buf * getpbuf() ; +void relpbuf(struct buf *bp) ; + +static inline void 
swapsizecheck() { + if( vm_swap_size < 128*btodb(PAGE_SIZE)) { + if( swap_pager_full) + printf("swap_pager: out of space\n"); + swap_pager_full = 1; + } else if( vm_swap_size > 192*btodb(PAGE_SIZE)) + swap_pager_full = 0; +} + +void +swap_pager_init() +{ + extern int dmmin, dmmax; + + dfltpagerops = &swappagerops; + + TAILQ_INIT(&swap_pager_list); + TAILQ_INIT(&swap_pager_un_list); + + /* + * Initialize clean lists + */ + TAILQ_INIT(&swap_pager_inuse); + TAILQ_INIT(&swap_pager_done); + TAILQ_INIT(&swap_pager_free); + + require_swap_init = 1; + + /* + * Calculate the swap allocation constants. + */ + + dmmin = CLBYTES/DEV_BSIZE; + dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2; + +} + +/* + * Allocate a pager structure and associated resources. + * Note that if we are called from the pageout daemon (handle == NULL) + * we should not wait for memory as it could resulting in deadlock. + */ +vm_pager_t +swap_pager_alloc(handle, size, prot, offset) + caddr_t handle; + register vm_size_t size; + vm_prot_t prot; + vm_offset_t offset; +{ + register vm_pager_t pager; + register sw_pager_t swp; + int waitok; + int i,j; + + if (require_swap_init) { + swp_clean_t spc; + struct buf *bp; + /* + * kva's are allocated here so that we dont need to keep + * doing kmem_alloc pageables at runtime + */ + for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) { + spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE); + if (!spc->spc_kva) { + break; + } + spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT); + if (!spc->spc_bp) { + kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); + break; + } + spc->spc_flags = 0; + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + } + require_swap_init = 0; + if( size == 0) + return(NULL); + } + + /* + * If this is a "named" anonymous region, look it up and + * return the appropriate pager if it exists. + */ + if (handle) { + pager = vm_pager_lookup(&swap_pager_list, handle); + if (pager != NULL) { + /* + * Use vm_object_lookup to gain a reference + * to the object and also to remove from the + * object cache. + */ + if (vm_object_lookup(pager) == NULL) + panic("swap_pager_alloc: bad object"); + return(pager); + } + } + + if (swap_pager_full) { + return(NULL); + } + + /* + * Pager doesn't exist, allocate swap management resources + * and initialize. + */ + waitok = handle ? M_WAITOK : M_NOWAIT; + pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok); + if (pager == NULL) + return(NULL); + swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok); + if (swp == NULL) { + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + size = round_page(size); + swp->sw_osize = size; + swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE); + swp->sw_blocks = (sw_blk_t) + malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks), + M_VMPGDATA, waitok); + if (swp->sw_blocks == NULL) { + free((caddr_t)swp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + + for (i = 0; i < swp->sw_nblocks; i++) { + swp->sw_blocks[i].swb_valid = 0; + swp->sw_blocks[i].swb_locked = 0; + for (j = 0; j < SWB_NPAGES; j++) + swp->sw_blocks[i].swb_block[j] = SWB_EMPTY; + } + + swp->sw_poip = 0; + if (handle) { + vm_object_t object; + + swp->sw_flags = SW_NAMED; + TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list); + /* + * Consistant with other pagers: return with object + * referenced. Can't do this with handle == NULL + * since it might be the pageout daemon calling. 
+ */ + object = vm_object_allocate(size); + vm_object_enter(object, pager); + vm_object_setpager(object, pager, 0, FALSE); + } else { + swp->sw_flags = 0; + TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list); + } + pager->pg_handle = handle; + pager->pg_ops = &swappagerops; + pager->pg_type = PG_SWAP; + pager->pg_data = (caddr_t)swp; + + return(pager); +} + +/* + * returns disk block associated with pager and offset + * additionally, as a side effect returns a flag indicating + * if the block has been written + */ + +static int * +swap_pager_diskaddr(swp, offset, valid) + sw_pager_t swp; + vm_offset_t offset; + int *valid; +{ + register sw_blk_t swb; + int ix; + + if (valid) + *valid = 0; + ix = offset / (SWB_NPAGES*PAGE_SIZE); + if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { + return(FALSE); + } + swb = &swp->sw_blocks[ix]; + ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; + if (valid) + *valid = swb->swb_valid & (1<<ix); + return &swb->swb_block[ix]; +} + +/* + * Utility routine to set the valid (written) bit for + * a block associated with a pager and offset + */ +static void +swap_pager_setvalid(swp, offset, valid) + sw_pager_t swp; + vm_offset_t offset; + int valid; +{ + register sw_blk_t swb; + int ix; + + ix = offset / (SWB_NPAGES*PAGE_SIZE); + if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) + return; + + swb = &swp->sw_blocks[ix]; + ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; + if (valid) + swb->swb_valid |= (1 << ix); + else + swb->swb_valid &= ~(1 << ix); + return; +} + +/* + * this routine allocates swap space with a fragmentation + * minimization policy. + */ +int +swap_pager_getswapspace( unsigned amount, unsigned *rtval) { + unsigned tmpalloc; + unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE); + if( amount < nblocksfrag) { + if( rlist_alloc(&swapfrag, amount, rtval)) + return 1; + if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc)) + return 0; + rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1); + *rtval = tmpalloc; + return 1; + } + if( !rlist_alloc(&swapmap, amount, rtval)) + return 0; + else + return 1; +} + +/* + * this routine frees swap space with a fragmentation + * minimization policy. 
+ */ +void +swap_pager_freeswapspace( unsigned from, unsigned to) { + unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE); + unsigned tmpalloc; + if( ((to + 1) - from) >= nblocksfrag) { + while( (from + nblocksfrag) <= to + 1) { + rlist_free(&swapmap, from, from + nblocksfrag - 1); + from += nblocksfrag; + } + } + if( from >= to) + return; + rlist_free(&swapfrag, from, to); + while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) { + rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1); + } +} +/* + * this routine frees swap blocks from a specified pager + */ +void +_swap_pager_freespace(swp, start, size) + sw_pager_t swp; + vm_offset_t start; + vm_offset_t size; +{ + vm_offset_t i; + int s; + + s = splbio(); + for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) { + int valid; + int *addr = swap_pager_diskaddr(swp, i, &valid); + if (addr && *addr != SWB_EMPTY) { + swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1); + if( valid) { + vm_swap_size += btodb(PAGE_SIZE); + swap_pager_setvalid(swp, i, 0); + } + *addr = SWB_EMPTY; + } + } + swapsizecheck(); + splx(s); +} + +void +swap_pager_freespace(pager, start, size) + vm_pager_t pager; + vm_offset_t start; + vm_offset_t size; +{ + _swap_pager_freespace((sw_pager_t) pager->pg_data, start, size); +} + +/* + * swap_pager_reclaim frees up over-allocated space from all pagers + * this eliminates internal fragmentation due to allocation of space + * for segments that are never swapped to. It has been written so that + * it does not block until the rlist_free operation occurs; it keeps + * the queues consistant. + */ + +/* + * Maximum number of blocks (pages) to reclaim per pass + */ +#define MAXRECLAIM 256 + +void +swap_pager_reclaim() +{ + vm_pager_t p; + sw_pager_t swp; + int i, j, k; + int s; + int reclaimcount; + static int reclaims[MAXRECLAIM]; + static int in_reclaim; + +/* + * allow only one process to be in the swap_pager_reclaim subroutine + */ + s = splbio(); + if (in_reclaim) { + tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0); + splx(s); + return; + } + in_reclaim = 1; + reclaimcount = 0; + + /* for each pager queue */ + for (k = 0; swp_qs[k]; k++) { + + p = swp_qs[k]->tqh_first; + while (p && (reclaimcount < MAXRECLAIM)) { + + /* + * see if any blocks associated with a pager has been + * allocated but not used (written) + */ + swp = (sw_pager_t) p->pg_data; + for (i = 0; i < swp->sw_nblocks; i++) { + sw_blk_t swb = &swp->sw_blocks[i]; + if( swb->swb_locked) + continue; + for (j = 0; j < SWB_NPAGES; j++) { + if (swb->swb_block[j] != SWB_EMPTY && + (swb->swb_valid & (1 << j)) == 0) { + reclaims[reclaimcount++] = swb->swb_block[j]; + swb->swb_block[j] = SWB_EMPTY; + if (reclaimcount >= MAXRECLAIM) + goto rfinished; + } + } + } + p = p->pg_list.tqe_next; + } + } + +rfinished: + +/* + * free the blocks that have been added to the reclaim list + */ + for (i = 0; i < reclaimcount; i++) { + swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1); + swapsizecheck(); + wakeup((caddr_t) &in_reclaim); + } + + splx(s); + in_reclaim = 0; + wakeup((caddr_t) &in_reclaim); +} + + +/* + * swap_pager_copy copies blocks from one pager to another and + * destroys the source pager + */ + +void +swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset) + vm_pager_t srcpager; + vm_offset_t srcoffset; + vm_pager_t dstpager; + vm_offset_t dstoffset; + vm_offset_t offset; +{ + sw_pager_t srcswp, dstswp; + vm_offset_t i; + int s; + + srcswp = (sw_pager_t) srcpager->pg_data; + dstswp = (sw_pager_t) 
dstpager->pg_data; + +/* + * remove the source pager from the swap_pager internal queue + */ + s = splbio(); + if (srcswp->sw_flags & SW_NAMED) { + TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list); + srcswp->sw_flags &= ~SW_NAMED; + } else { + TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list); + } + + while (srcswp->sw_poip) { + tsleep((caddr_t)srcswp, PVM, "spgout", 0); + } + splx(s); + +/* + * clean all of the pages that are currently active and finished + */ + (void) swap_pager_clean(); + + s = splbio(); +/* + * clear source block before destination object + * (release allocated space) + */ + for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) { + int valid; + int *addr = swap_pager_diskaddr(srcswp, i, &valid); + if (addr && *addr != SWB_EMPTY) { + swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1); + if( valid) + vm_swap_size += btodb(PAGE_SIZE); + swapsizecheck(); + *addr = SWB_EMPTY; + } + } +/* + * transfer source to destination + */ + for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) { + int srcvalid, dstvalid; + int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset, + &srcvalid); + int *dstaddrp; + /* + * see if the source has space allocated + */ + if (srcaddrp && *srcaddrp != SWB_EMPTY) { + /* + * if the source is valid and the dest has no space, then + * copy the allocation from the srouce to the dest. + */ + if (srcvalid) { + dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid); + /* + * if the dest already has a valid block, deallocate the + * source block without copying. + */ + if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { + swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1); + *dstaddrp = SWB_EMPTY; + } + if (dstaddrp && *dstaddrp == SWB_EMPTY) { + *dstaddrp = *srcaddrp; + *srcaddrp = SWB_EMPTY; + swap_pager_setvalid(dstswp, i + dstoffset, 1); + vm_swap_size -= btodb(PAGE_SIZE); + } + } + /* + * if the source is not empty at this point, then deallocate the space. + */ + if (*srcaddrp != SWB_EMPTY) { + swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1); + if( srcvalid) + vm_swap_size += btodb(PAGE_SIZE); + *srcaddrp = SWB_EMPTY; + } + } + } + +/* + * deallocate the rest of the source object + */ + for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) { + int valid; + int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid); + if (srcaddrp && *srcaddrp != SWB_EMPTY) { + swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1); + if( valid) + vm_swap_size += btodb(PAGE_SIZE); + *srcaddrp = SWB_EMPTY; + } + } + + swapsizecheck(); + splx(s); + + free((caddr_t)srcswp->sw_blocks, M_VMPGDATA); + srcswp->sw_blocks = 0; + free((caddr_t)srcswp, M_VMPGDATA); + srcpager->pg_data = 0; + free((caddr_t)srcpager, M_VMPAGER); + + return; +} + + +void +swap_pager_dealloc(pager) + vm_pager_t pager; +{ + register int i,j; + register sw_blk_t bp; + register sw_pager_t swp; + int s; + + /* + * Remove from list right away so lookups will fail if we + * block for pageout completion. + */ + s = splbio(); + swp = (sw_pager_t) pager->pg_data; + if (swp->sw_flags & SW_NAMED) { + TAILQ_REMOVE(&swap_pager_list, pager, pg_list); + swp->sw_flags &= ~SW_NAMED; + } else { + TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list); + } + /* + * Wait for all pageouts to finish and remove + * all entries from cleaning list. 
+ */ + + while (swp->sw_poip) { + tsleep((caddr_t)swp, PVM, "swpout", 0); + } + splx(s); + + + (void) swap_pager_clean(); + + /* + * Free left over swap blocks + */ + s = splbio(); + for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) { + for (j = 0; j < SWB_NPAGES; j++) + if (bp->swb_block[j] != SWB_EMPTY) { + swap_pager_freeswapspace((unsigned)bp->swb_block[j], + (unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1); + if( bp->swb_valid & (1<<j)) + vm_swap_size += btodb(PAGE_SIZE); + bp->swb_block[j] = SWB_EMPTY; + } + } + splx(s); + swapsizecheck(); + + /* + * Free swap management resources + */ + free((caddr_t)swp->sw_blocks, M_VMPGDATA); + swp->sw_blocks = 0; + free((caddr_t)swp, M_VMPGDATA); + pager->pg_data = 0; + free((caddr_t)pager, M_VMPAGER); +} + +/* + * swap_pager_getmulti can get multiple pages. + */ +int +swap_pager_getmulti(pager, m, count, reqpage, sync) + vm_pager_t pager; + vm_page_t *m; + int count; + int reqpage; + boolean_t sync; +{ + if( reqpage >= count) + panic("swap_pager_getmulti: reqpage >= count\n"); + return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage); +} + +/* + * swap_pager_getpage gets individual pages + */ +int +swap_pager_getpage(pager, m, sync) + vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ + vm_page_t marray[1]; + + marray[0] = m; + return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0); +} + +int +swap_pager_putmulti(pager, m, c, sync, rtvals) + vm_pager_t pager; + vm_page_t *m; + int c; + boolean_t sync; + int *rtvals; +{ + int flags; + + if (pager == NULL) { + (void) swap_pager_clean(); + return VM_PAGER_OK; + } + + flags = B_WRITE; + if (!sync) + flags |= B_ASYNC; + + return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals); +} + +/* + * swap_pager_putpage writes individual pages + */ +int +swap_pager_putpage(pager, m, sync) + vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ + int flags; + vm_page_t marray[1]; + int rtvals[1]; + + + if (pager == NULL) { + (void) swap_pager_clean(); + return VM_PAGER_OK; + } + + marray[0] = m; + flags = B_WRITE; + if (!sync) + flags |= B_ASYNC; + + swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals); + + return rtvals[0]; +} + +static inline int +const swap_pager_block_index(swp, offset) + sw_pager_t swp; + vm_offset_t offset; +{ + return (offset / (SWB_NPAGES*PAGE_SIZE)); +} + +static inline int +const swap_pager_block_offset(swp, offset) + sw_pager_t swp; + vm_offset_t offset; +{ + return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE); +} + +/* + * _swap_pager_haspage returns TRUE if the pager has data that has + * been written out. + */ +static boolean_t +_swap_pager_haspage(swp, offset) + sw_pager_t swp; + vm_offset_t offset; +{ + register sw_blk_t swb; + int ix; + + ix = offset / (SWB_NPAGES*PAGE_SIZE); + if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { + return(FALSE); + } + swb = &swp->sw_blocks[ix]; + ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; + if (swb->swb_block[ix] != SWB_EMPTY) { + if (swb->swb_valid & (1 << ix)) + return TRUE; + } + + return(FALSE); +} + +/* + * swap_pager_haspage is the externally accessible version of + * _swap_pager_haspage above. this routine takes a vm_pager_t + * for an argument instead of sw_pager_t. + */ +boolean_t +swap_pager_haspage(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ + return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset); +} + +/* + * swap_pager_freepage is a convienience routine that clears the busy + * bit and deallocates a page. 
+ */ +static void +swap_pager_freepage(m) + vm_page_t m; +{ + PAGE_WAKEUP(m); + vm_page_free(m); +} + +/* + * swap_pager_ridpages is a convienience routine that deallocates all + * but the required page. this is usually used in error returns that + * need to invalidate the "extra" readahead pages. + */ +static void +swap_pager_ridpages(m, count, reqpage) + vm_page_t *m; + int count; + int reqpage; +{ + int i; + for (i = 0; i < count; i++) + if (i != reqpage) + swap_pager_freepage(m[i]); +} + +int swapwritecount=0; + +/* + * swap_pager_iodone1 is the completion routine for both reads and async writes + */ +void +swap_pager_iodone1(bp) + struct buf *bp; +{ + bp->b_flags |= B_DONE; + bp->b_flags &= ~B_ASYNC; + wakeup((caddr_t)bp); +/* + if ((bp->b_flags & B_READ) == 0) + vwakeup(bp); +*/ +} + + +int +swap_pager_input(swp, m, count, reqpage) + register sw_pager_t swp; + vm_page_t *m; + int count, reqpage; +{ + register struct buf *bp; + sw_blk_t swb[count]; + register int s; + int i; + boolean_t rv; + vm_offset_t kva, off[count]; + swp_clean_t spc; + vm_offset_t paging_offset; + vm_object_t object; + int reqaddr[count]; + + int first, last; + int failed; + int reqdskregion; + + object = m[reqpage]->object; + paging_offset = object->paging_offset; + /* + * First determine if the page exists in the pager if this is + * a sync read. This quickly handles cases where we are + * following shadow chains looking for the top level object + * with the page. + */ + if (swp->sw_blocks == NULL) { + swap_pager_ridpages(m, count, reqpage); + return(VM_PAGER_FAIL); + } + + for(i = 0; i < count; i++) { + vm_offset_t foff = m[i]->offset + paging_offset; + int ix = swap_pager_block_index(swp, foff); + if (ix >= swp->sw_nblocks) { + int j; + if( i <= reqpage) { + swap_pager_ridpages(m, count, reqpage); + return(VM_PAGER_FAIL); + } + for(j = i; j < count; j++) { + swap_pager_freepage(m[j]); + } + count = i; + break; + } + + swb[i] = &swp->sw_blocks[ix]; + off[i] = swap_pager_block_offset(swp, foff); + reqaddr[i] = swb[i]->swb_block[off[i]]; + } + + /* make sure that our required input request is existant */ + + if (reqaddr[reqpage] == SWB_EMPTY || + (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { + swap_pager_ridpages(m, count, reqpage); + return(VM_PAGER_FAIL); + } + + + reqdskregion = reqaddr[reqpage] / dmmax; + + /* + * search backwards for the first contiguous page to transfer + */ + failed = 0; + first = 0; + for (i = reqpage - 1; i >= 0; --i) { + if ( failed || (reqaddr[i] == SWB_EMPTY) || + (swb[i]->swb_valid & (1 << off[i])) == 0 || + (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || + ((reqaddr[i] / dmmax) != reqdskregion)) { + failed = 1; + swap_pager_freepage(m[i]); + if (first == 0) + first = i + 1; + } + } + /* + * search forwards for the last contiguous page to transfer + */ + failed = 0; + last = count; + for (i = reqpage + 1; i < count; i++) { + if ( failed || (reqaddr[i] == SWB_EMPTY) || + (swb[i]->swb_valid & (1 << off[i])) == 0 || + (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || + ((reqaddr[i] / dmmax) != reqdskregion)) { + failed = 1; + swap_pager_freepage(m[i]); + if (last == count) + last = i; + } + } + + count = last; + if (first != 0) { + for (i = first; i < count; i++) { + m[i-first] = m[i]; + reqaddr[i-first] = reqaddr[i]; + off[i-first] = off[i]; + } + count -= first; + reqpage -= first; + } + + ++swb[reqpage]->swb_locked; + + /* + * at this point: + * "m" is a pointer to the array of vm_page_t for paging I/O + * "count" is the 
number of vm_page_t entries represented by "m" + * "object" is the vm_object_t for I/O + * "reqpage" is the index into "m" for the page actually faulted + */ + + spc = NULL; /* we might not use an spc data structure */ + kva = 0; + + /* + * we allocate a new kva for transfers > 1 page + * but for transfers == 1 page, the swap_pager_free list contains + * entries that have pre-allocated kva's (for efficiency). + */ + if (count > 1) { + kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE); + } + + + if (!kva) { + /* + * if a kva has not been allocated, we can only do a one page transfer, + * so we free the other pages that might have been allocated by + * vm_fault. + */ + swap_pager_ridpages(m, count, reqpage); + m[0] = m[reqpage]; + reqaddr[0] = reqaddr[reqpage]; + + count = 1; + reqpage = 0; + /* + * get a swap pager clean data structure, block until we get it + */ + if (swap_pager_free.tqh_first == NULL) { + s = splbio(); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + while (swap_pager_free.tqh_first == NULL) { + swap_pager_needflags |= SWAP_FREE_NEEDED; + tsleep((caddr_t)&swap_pager_free, + PVM, "swpfre", 0); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + } + splx(s); + } + spc = swap_pager_free.tqh_first; + TAILQ_REMOVE(&swap_pager_free, spc, spc_list); + kva = spc->spc_kva; + } + + + /* + * map our page(s) into kva for input + */ + for (i = 0; i < count; i++) { + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + } + pmap_update(); + + + /* + * Get a swap buffer header and perform the IO + */ + if( spc) { + bp = spc->spc_bp; + bzero(bp, sizeof *bp); + bp->b_spc = spc; + } else { + bp = getpbuf(); + } + + s = splbio(); + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = swap_pager_iodone1; + bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + crhold(bp->b_rcred); + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = reqaddr[0]; + bp->b_bcount = PAGE_SIZE*count; + bp->b_bufsize = PAGE_SIZE*count; + +/* + VHOLD(swapdev_vp); + bp->b_vp = swapdev_vp; + if (swapdev_vp->v_type == VBLK) + bp->b_dev = swapdev_vp->v_rdev; +*/ + bgetvp( swapdev_vp, bp); + + swp->sw_piip++; + + /* + * perform the I/O + */ + VOP_STRATEGY(bp); + + /* + * wait for the sync I/O to complete + */ + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "swread", 0); + } + rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK; + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE); + + --swp->sw_piip; + if (swp->sw_piip == 0) + wakeup((caddr_t) swp); + + /* + * relpbuf does this, but we maintain our own buffer + * list also... + */ + if (bp->b_vp) + brelvp(bp); + + splx(s); + --swb[reqpage]->swb_locked; + + /* + * remove the mapping for kernel virtual + */ + pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE); + + if (spc) { + /* + * if we have used an spc, we need to free it. 
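The synchronous read just issued above follows a simple handshake: the request is marked busy, handed to the strategy routine, and the caller sleeps until the completion callback sets the done flag and wakes it. Below is a minimal single-threaded model of that flag protocol; the flag names and the stubbed "device" are hypothetical, and the kernel of course sleeps instead of spinning.

#include <stdio.h>

#define REQ_BUSY  0x01                  /* hypothetical flag bits */
#define REQ_DONE  0x02
#define REQ_ERROR 0x04

struct request {
	int flags;
};

/* Completion callback: the "device" marks the request finished. */
static void request_done(struct request *req, int error)
{
	if (error)
		req->flags |= REQ_ERROR;
	req->flags |= REQ_DONE;         /* in the kernel this pairs with wakeup() */
}

/* Stand-in for handing the buffer to the strategy routine. */
static void start_io(struct request *req)
{
	request_done(req, 0);           /* here the I/O "completes" immediately */
}

int main(void)
{
	struct request req = { REQ_BUSY };

	start_io(&req);
	while ((req.flags & REQ_DONE) == 0)
		;                       /* the kernel tsleep()s here instead of spinning */
	printf("%s\n", (req.flags & REQ_ERROR) ? "failed" : "ok");
	return 0;
}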
+ */ + if( bp->b_rcred != NOCRED) + crfree(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crfree(bp->b_wcred); + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + if (swap_pager_needflags & SWAP_FREE_NEEDED) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + } + } else { + /* + * free the kernel virtual addresses + */ + kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE); + /* + * release the physical I/O buffer + */ + relpbuf(bp); + /* + * finish up input if everything is ok + */ + if( rv == VM_PAGER_OK) { + for (i = 0; i < count; i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + if (i != reqpage) { + /* + * whether or not to leave the page activated + * is up in the air, but we should put the page + * on a page queue somewhere. (it already is in + * the object). + * After some emperical results, it is best + * to deactivate the readahead pages. + */ + vm_page_deactivate(m[i]); + + /* + * just in case someone was asking for this + * page we now tell them that it is ok to use + */ + m[i]->flags &= ~PG_FAKE; + PAGE_WAKEUP(m[i]); + } + } + if( swap_pager_full) { + _swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE); + } + } else { + swap_pager_ridpages(m, count, reqpage); + } + } + return(rv); +} + +int +swap_pager_output(swp, m, count, flags, rtvals) + register sw_pager_t swp; + vm_page_t *m; + int count; + int flags; + int *rtvals; +{ + register struct buf *bp; + sw_blk_t swb[count]; + register int s; + int i, j, ix; + boolean_t rv; + vm_offset_t kva, off, foff; + swp_clean_t spc; + vm_offset_t paging_offset; + vm_object_t object; + int reqaddr[count]; + int failed; + +/* + if( count > 1) + printf("off: 0x%x, count: %d\n", m[0]->offset, count); +*/ + spc = NULL; + + object = m[0]->object; + paging_offset = object->paging_offset; + + failed = 0; + for(j=0;j<count;j++) { + foff = m[j]->offset + paging_offset; + ix = swap_pager_block_index(swp, foff); + swb[j] = 0; + if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { + rtvals[j] = VM_PAGER_FAIL; + failed = 1; + continue; + } else { + rtvals[j] = VM_PAGER_OK; + } + swb[j] = &swp->sw_blocks[ix]; + ++swb[j]->swb_locked; + if( failed) { + rtvals[j] = VM_PAGER_FAIL; + continue; + } + off = swap_pager_block_offset(swp, foff); + reqaddr[j] = swb[j]->swb_block[off]; + if( reqaddr[j] == SWB_EMPTY) { + int blk; + int tries; + int ntoget; + tries = 0; + s = splbio(); + + /* + * if any other pages have been allocated in this block, we + * only try to get one page. + */ + for (i = 0; i < SWB_NPAGES; i++) { + if (swb[j]->swb_block[i] != SWB_EMPTY) + break; + } + + + ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1; + /* + * this code is alittle conservative, but works + * (the intent of this code is to allocate small chunks + * for small objects) + */ + if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) { + ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE; + } + +retrygetspace: + if (!swap_pager_full && ntoget > 1 && + swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) { + + for (i = 0; i < ntoget; i++) { + swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; + swb[j]->swb_valid = 0; + } + + reqaddr[j] = swb[j]->swb_block[off]; + } else if (!swap_pager_getswapspace(btodb(PAGE_SIZE), + &swb[j]->swb_block[off])) { + /* + * if the allocation has failed, we try to reclaim space and + * retry. 
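The swap-space allocation here follows a try / reclaim-once / retry shape: ask for a multi-page cluster, fall back to a single block, and if even that fails, reclaim space and try exactly once more before reporting "again". A compact stand-alone sketch of that control flow, with the allocator and reclaimer reduced to hypothetical stubs, follows.

#include <stdio.h>

/* Hypothetical allocator: returns 1 on success and stores a block number. */
static int get_space(int nblocks, int *blk)
{
	(void)nblocks;
	*blk = -1;
	return 0;                       /* pretend the swap area is full */
}

static void reclaim_space(void)
{
	/* in the pager this recovers space from completed pageouts */
}

/* Returns a block number, or -1 meaning "try again later". */
static int alloc_with_retry(int want)
{
	int blk, tries = 0;

retry:
	if (get_space(want, &blk))      /* try a cluster first */
		return blk;
	if (get_space(1, &blk))         /* fall back to one block */
		return blk;
	if (++tries == 1) {             /* reclaim and retry exactly once */
		reclaim_space();
		goto retry;
	}
	return -1;
}

int main(void)
{
	printf("blk=%d\n", alloc_with_retry(8));
	return 0;
}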
+ */ + if (++tries == 1) { + swap_pager_reclaim(); + goto retrygetspace; + } + rtvals[j] = VM_PAGER_AGAIN; + failed = 1; + } else { + reqaddr[j] = swb[j]->swb_block[off]; + swb[j]->swb_valid &= ~(1<<off); + } + splx(s); + } + } + + /* + * search forwards for the last contiguous page to transfer + */ + failed = 0; + for (i = 0; i < count; i++) { + if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) || + (reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) || + (rtvals[i] != VM_PAGER_OK)) { + failed = 1; + if( rtvals[i] == VM_PAGER_OK) + rtvals[i] = VM_PAGER_AGAIN; + } + } + + for(i = 0; i < count; i++) { + if( rtvals[i] != VM_PAGER_OK) { + if( swb[i]) + --swb[i]->swb_locked; + } + } + + for(i = 0; i < count; i++) + if( rtvals[i] != VM_PAGER_OK) + break; + + if( i == 0) { + return VM_PAGER_AGAIN; + } + + count = i; + for(i=0;i<count;i++) { + if( reqaddr[i] == SWB_EMPTY) + printf("I/O to empty block????\n"); + } + + /* + */ + + /* + * For synchronous writes, we clean up + * all completed async pageouts. + */ + if ((flags & B_ASYNC) == 0) { + swap_pager_clean(); + } + + kva = 0; + + /* + * we allocate a new kva for transfers > 1 page + * but for transfers == 1 page, the swap_pager_free list contains + * entries that have pre-allocated kva's (for efficiency). + */ + if ( count > 1) { + kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE); + if( !kva) { + for (i = 0; i < count; i++) { + if( swb[i]) + --swb[i]->swb_locked; + rtvals[i] = VM_PAGER_AGAIN; + } + return VM_PAGER_AGAIN; + } + } + + /* + * get a swap pager clean data structure, block until we get it + */ + if (swap_pager_free.tqh_first == NULL) { +/* + if (flags & B_ASYNC) { + for(i=0;i<count;i++) { + rtvals[i] = VM_PAGER_AGAIN; + if( swb[i]) + --swb[i]->swb_locked; + } + return VM_PAGER_AGAIN; + } +*/ + + s = splbio(); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + while (swap_pager_free.tqh_first == NULL) { + swap_pager_needflags |= SWAP_FREE_NEEDED; + tsleep((caddr_t)&swap_pager_free, + PVM, "swpfre", 0); + if( curproc == pageproc) + (void) swap_pager_clean(); + else + wakeup((caddr_t) &vm_pages_needed); + } + splx(s); + } + + spc = swap_pager_free.tqh_first; + TAILQ_REMOVE(&swap_pager_free, spc, spc_list); + if( !kva) { + kva = spc->spc_kva; + spc->spc_altkva = 0; + } else { + spc->spc_altkva = kva; + } + + /* + * map our page(s) into kva for I/O + */ + for (i = 0; i < count; i++) { + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + } + pmap_update(); + + /* + * get the base I/O offset into the swap file + */ + for(i=0;i<count;i++) { + foff = m[i]->offset + paging_offset; + off = swap_pager_block_offset(swp, foff); + /* + * if we are setting the valid bit anew, + * then diminish the swap free space + */ + if( (swb[i]->swb_valid & (1 << off)) == 0) + vm_swap_size -= btodb(PAGE_SIZE); + + /* + * set the valid bit + */ + swb[i]->swb_valid |= (1 << off); + /* + * and unlock the data structure + */ + --swb[i]->swb_locked; + } + + s = splbio(); + /* + * Get a swap buffer header and perform the IO + */ + bp = spc->spc_bp; + bzero(bp, sizeof *bp); + bp->b_spc = spc; + + bp->b_flags = B_BUSY; + bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + crhold(bp->b_rcred); + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = reqaddr[0]; + bgetvp( swapdev_vp, bp); +/* + VHOLD(swapdev_vp); + bp->b_vp = swapdev_vp; + if (swapdev_vp->v_type == VBLK) + bp->b_dev = swapdev_vp->v_rdev; +*/ + 
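The run-trimming scan above (and the matching backward/forward scans in the input path) keeps a cluster only while its swap blocks are physically contiguous and stay inside one swap interleave region. A small model of that trimming is given below; BLOCKS_PER_PAGE and REGION_BLOCKS are illustrative constants standing in for btodb(PAGE_SIZE) and dmmax.

#include <stdio.h>

#define BLOCKS_PER_PAGE 8               /* stand-in for btodb(PAGE_SIZE) */
#define REGION_BLOCKS   2048            /* stand-in for dmmax */

/*
 * Return how many leading pages of addr[] form one contiguous run that
 * does not cross a region boundary.  addr[i] < 0 means "no block".
 */
static int contiguous_run(const int *addr, int count)
{
	int i;

	for (i = 1; i < count; i++) {
		if (addr[i] < 0 ||
		    addr[i] != addr[0] + i * BLOCKS_PER_PAGE ||
		    addr[i] / REGION_BLOCKS != addr[0] / REGION_BLOCKS)
			break;
	}
	return i;
}

int main(void)
{
	int addr[] = { 100, 108, 116, 500 };    /* last block breaks the run */

	printf("run=%d\n", contiguous_run(addr, 4));
	return 0;
}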
bp->b_bcount = PAGE_SIZE*count; + bp->b_bufsize = PAGE_SIZE*count; + swapdev_vp->v_numoutput++; + + /* + * If this is an async write we set up additional buffer fields + * and place a "cleaning" entry on the inuse queue. + */ + if ( flags & B_ASYNC ) { + spc->spc_flags = 0; + spc->spc_swp = swp; + for(i=0;i<count;i++) + spc->spc_m[i] = m[i]; + spc->spc_count = count; + /* + * the completion routine for async writes + */ + bp->b_flags |= B_CALL; + bp->b_iodone = swap_pager_iodone; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bcount; + swp->sw_poip++; + TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); + } else { + swp->sw_poip++; + bp->b_flags |= B_CALL; + bp->b_iodone = swap_pager_iodone1; + } + /* + * perform the I/O + */ + VOP_STRATEGY(bp); + if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) { + if ((bp->b_flags & B_DONE) == B_DONE) { + swap_pager_clean(); + } + splx(s); + for(i=0;i<count;i++) { + rtvals[i] = VM_PAGER_PEND; + } + return VM_PAGER_PEND; + } + + /* + * wait for the sync I/O to complete + */ + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "swwrt", 0); + } + rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK; + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE); + + --swp->sw_poip; + if (swp->sw_poip == 0) + wakeup((caddr_t) swp); + + if (bp->b_vp) + brelvp(bp); + + splx(s); + + /* + * remove the mapping for kernel virtual + */ + pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE); + + /* + * if we have written the page, then indicate that the page + * is clean. + */ + if (rv == VM_PAGER_OK) { + for(i=0;i<count;i++) { + if( rtvals[i] == VM_PAGER_OK) { + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + /* + * optimization, if a page has been read during the + * pageout process, we activate it. + */ + if ( (m[i]->flags & PG_ACTIVE) == 0 && + pmap_is_referenced(VM_PAGE_TO_PHYS(m[i]))) + vm_page_activate(m[i]); + } + } + } else { + for(i=0;i<count;i++) { + rtvals[i] = rv; + m[i]->flags |= PG_LAUNDRY; + } + } + + if( spc->spc_altkva) + kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE); + + if( bp->b_rcred != NOCRED) + crfree(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crfree(bp->b_wcred); + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + if (swap_pager_needflags & SWAP_FREE_NEEDED) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + } + + return(rv); +} + +boolean_t +swap_pager_clean() +{ + register swp_clean_t spc, tspc; + register int s; + + tspc = NULL; + if (swap_pager_done.tqh_first == NULL) + return FALSE; + for (;;) { + s = splbio(); + /* + * Look up and removal from done list must be done + * at splbio() to avoid conflicts with swap_pager_iodone. + */ + while (spc = swap_pager_done.tqh_first) { + if( spc->spc_altkva) { + pmap_remove(vm_map_pmap(pager_map), spc->spc_altkva, spc->spc_altkva + spc->spc_count * PAGE_SIZE); + kmem_free_wakeup(pager_map, spc->spc_altkva, spc->spc_count * PAGE_SIZE); + spc->spc_altkva = 0; + } else { + pmap_remove(vm_map_pmap(pager_map), spc->spc_kva, spc->spc_kva + PAGE_SIZE); + } + swap_pager_finish(spc); + TAILQ_REMOVE(&swap_pager_done, spc, spc_list); + goto doclean; + } + + /* + * No operations done, thats all we can do for now. + */ + + splx(s); + break; + + /* + * The desired page was found to be busy earlier in + * the scan but has since completed. 
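The asynchronous pageouts handled here cycle a cleaning record through three lists: free, then inuse while the write is in flight, then done after the interrupt-time completion, and finally back to free once swap_pager_clean has post-processed it. The sketch below models that life cycle with a plain enum instead of the kernel's queues; all names are illustrative.

#include <stdio.h>

enum spc_state { SPC_FREE, SPC_INUSE, SPC_DONE };

struct cleaner {
	enum spc_state state;
	int            error;
};

static void start_async_write(struct cleaner *c)
{
	c->state = SPC_INUSE;           /* queued behind the device */
}

/* Runs at "interrupt" time when the write finishes. */
static void write_completed(struct cleaner *c, int error)
{
	c->error = error;
	c->state = SPC_DONE;            /* moved from the inuse to the done list */
}

/* Later, in process context, the done entry is post-processed and recycled. */
static void clean(struct cleaner *c)
{
	if (c->state != SPC_DONE)
		return;
	if (c->error)
		printf("pageout failed, page stays dirty\n");
	else
		printf("pageout ok, page marked clean\n");
	c->state = SPC_FREE;
}

int main(void)
{
	struct cleaner c = { SPC_FREE, 0 };

	start_async_write(&c);
	write_completed(&c, 0);
	clean(&c);
	return 0;
}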
+ */ +doclean: + if (tspc && tspc == spc) { + tspc = NULL; + } + spc->spc_flags = 0; + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + if (swap_pager_needflags & SWAP_FREE_NEEDED) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + } + ++cleandone; + splx(s); + } + + return(tspc ? TRUE : FALSE); +} + +void +swap_pager_finish(spc) + register swp_clean_t spc; +{ + vm_object_t object = spc->spc_m[0]->object; + int i; + + if ((object->paging_in_progress -= spc->spc_count) == 0) + thread_wakeup((int) object); + + /* + * If no error mark as clean and inform the pmap system. + * If error, mark as dirty so we will try again. + * (XXX could get stuck doing this, should give up after awhile) + */ + if (spc->spc_flags & SPC_ERROR) { + for(i=0;i<spc->spc_count;i++) { + printf("swap_pager_finish: clean of page %x failed\n", + VM_PAGE_TO_PHYS(spc->spc_m[i])); + spc->spc_m[i]->flags |= PG_LAUNDRY; + } + } else { + for(i=0;i<spc->spc_count;i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i])); + spc->spc_m[i]->flags |= PG_CLEAN; + } + } + + + for(i=0;i<spc->spc_count;i++) { + /* + * we wakeup any processes that are waiting on + * these pages. + */ + PAGE_WAKEUP(spc->spc_m[i]); + } + nswiodone -= spc->spc_count; + + return; +} + +/* + * swap_pager_iodone + */ +void +swap_pager_iodone(bp) + register struct buf *bp; +{ + register swp_clean_t spc; + int s; + + s = splbio(); + spc = (swp_clean_t) bp->b_spc; + TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); + TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); + if (bp->b_flags & B_ERROR) { + spc->spc_flags |= SPC_ERROR; + printf("error %d blkno %d sz %d ", + bp->b_error, bp->b_blkno, bp->b_bcount); + } + +/* + if ((bp->b_flags & B_READ) == 0) + vwakeup(bp); +*/ + + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC); + if (bp->b_vp) { + brelvp(bp); + } + if( bp->b_rcred != NOCRED) + crfree(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crfree(bp->b_wcred); + + nswiodone += spc->spc_count; + if (--spc->spc_swp->sw_poip == 0) { + wakeup((caddr_t)spc->spc_swp); + } + + if ((swap_pager_needflags & SWAP_FREE_NEEDED) || + swap_pager_inuse.tqh_first == 0) { + swap_pager_needflags &= ~SWAP_FREE_NEEDED; + wakeup((caddr_t)&swap_pager_free); + wakeup((caddr_t)&vm_pages_needed); + } + + if (vm_pageout_pages_needed) { + wakeup((caddr_t)&vm_pageout_pages_needed); + } + + if ((swap_pager_inuse.tqh_first == NULL) || + (cnt.v_free_count < cnt.v_free_min && + nswiodone + cnt.v_free_count >= cnt.v_free_min) ) { + wakeup((caddr_t)&vm_pages_needed); + } + splx(s); +} + +int bswneeded; +/* TAILQ_HEAD(swqueue, buf) bswlist; */ +/* + * allocate a physical buffer + */ +struct buf * +getpbuf() { + int s; + struct buf *bp; + + s = splbio(); + /* get a bp from the swap buffer header pool */ + while ((bp = bswlist.tqh_first) == NULL) { + bswneeded = 1; + tsleep((caddr_t)&bswneeded, PVM, "wswbuf", 0); + } + TAILQ_REMOVE(&bswlist, bp, b_freelist); + + splx(s); + + bzero(bp, sizeof *bp); + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + return bp; +} + +/* + * allocate a physical buffer, if one is available + */ +struct buf * +trypbuf() { + int s; + struct buf *bp; + + s = splbio(); + if ((bp = bswlist.tqh_first) == NULL) { + splx(s); + return NULL; + } + TAILQ_REMOVE(&bswlist, bp, b_freelist); + splx(s); + + bzero(bp, sizeof *bp); + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + return bp; +} + +/* + * release a physical buffer + */ +void +relpbuf(bp) + struct buf *bp; +{ + int s; + + s = splbio(); + + if (bp->b_rcred != NOCRED) { + 
crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + + if (bp->b_vp) + brelvp(bp); + + TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); + + if (bswneeded) { + bswneeded = 0; + wakeup((caddr_t)&bswlist); + } + splx(s); +} + +/* + * return true if any swap control structures can be allocated + */ +int +swap_pager_ready() { + if( swap_pager_free.tqh_first) + return 1; + else + return 0; +} diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h new file mode 100644 index 0000000..853edd5 --- /dev/null +++ b/sys/vm/swap_pager.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90 + * $Id: swap_pager.h,v 1.9 1994/03/14 21:54:23 davidg Exp $ + */ + +/* + * Modifications to the block allocation data structure by John S. Dyson + * 18 Dec 93. + */ + +#ifndef _SWAP_PAGER_ +#define _SWAP_PAGER_ 1 + +/* + * SWB_NPAGES can be set to any value from 1 to 16 pages per allocation, + * however, due to the allocation spilling into non-swap pager backed memory, + * suggest keeping SWB_NPAGES small (1-4). If high performance is manditory + * perhaps up to 8 pages might be in order???? + * Above problem has been fixed, now we support 16 pages per block. Unused + * space is recovered by the swap pager now... 
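The physical-buffer routines just above (getpbuf, trypbuf, relpbuf) implement a classic fixed pool: take a header off a free list, sleeping if none is available, and on release put it back and wake any waiter. A single-threaded sketch of such a pool, with the sleep/wakeup points reduced to comments and a made-up pool size, is shown below.

#include <stdio.h>
#include <string.h>

#define POOL_SIZE 4

struct pbuf {
	struct pbuf *next;
	char         data[64];
};

static struct pbuf  pool[POOL_SIZE];
static struct pbuf *freelist;
static int          waiters;

static void pool_init(void)
{
	int i;

	for (i = 0; i < POOL_SIZE; i++) {
		pool[i].next = freelist;
		freelist = &pool[i];
	}
}

/* Non-blocking variant, in the spirit of trypbuf(): NULL when empty. */
static struct pbuf *pool_tryget(void)
{
	struct pbuf *bp = freelist;

	if (bp != NULL) {
		freelist = bp->next;
		memset(bp, 0, sizeof *bp);      /* hand back a clean header */
	}
	return bp;                              /* a blocking get would sleep here */
}

/* In the spirit of relpbuf(): return the buffer and wake any waiter. */
static void pool_put(struct pbuf *bp)
{
	bp->next = freelist;
	freelist = bp;
	if (waiters) {
		waiters = 0;
		/* kernel: wakeup() the sleepers */
	}
}

int main(void)
{
	struct pbuf *bp;

	pool_init();
	bp = pool_tryget();
	printf("got %p\n", (void *)bp);
	pool_put(bp);
	return 0;
}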
+ */ +#define SWB_NPAGES 8 +struct swblock { + unsigned short swb_valid; /* bitmask for valid pages */ + unsigned short swb_locked; /* block locked */ + int swb_block[SWB_NPAGES]; /* unfortunately int instead of daddr_t */ +}; +typedef struct swblock *sw_blk_t; + +/* + * Swap pager private data. + */ +struct swpager { + vm_size_t sw_osize; /* size of object we are backing (bytes) */ + int sw_nblocks;/* number of blocks in list (sw_blk_t units) */ + sw_blk_t sw_blocks; /* pointer to list of swap blocks */ + short sw_flags; /* flags */ + short sw_poip; /* pageouts in progress */ + short sw_piip; /* pageins in progress */ +}; +typedef struct swpager *sw_pager_t; + +#define SW_WANTED 0x01 +#define SW_NAMED 0x02 + +#ifdef KERNEL + +void swap_pager_init(void); +vm_pager_t swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t); +void swap_pager_dealloc(vm_pager_t); +boolean_t swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t); +boolean_t swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t); +boolean_t swap_pager_haspage(vm_pager_t, vm_offset_t); +int swap_pager_io(sw_pager_t, vm_page_t *, int, int, int); +void swap_pager_iodone(struct buf *); +boolean_t swap_pager_clean(); + +extern struct pagerops swappagerops; + +#endif + +#endif /* _SWAP_PAGER_ */ diff --git a/sys/vm/vm.h b/sys/vm/vm.h new file mode 100644 index 0000000..bc18dd2 --- /dev/null +++ b/sys/vm/vm.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vm.h 8.2 (Berkeley) 12/13/93 + */ + +#ifndef VM_H +#define VM_H + +typedef char vm_inherit_t; /* XXX: inheritance codes */ + +union vm_map_object; +typedef union vm_map_object vm_map_object_t; + +struct vm_map_entry; +typedef struct vm_map_entry *vm_map_entry_t; + +struct vm_map; +typedef struct vm_map *vm_map_t; + +struct vm_object; +typedef struct vm_object *vm_object_t; + +struct vm_page; +typedef struct vm_page *vm_page_t; + +struct pager_struct; +typedef struct pager_struct *vm_pager_t; + +#include <sys/vmmeter.h> +#include <sys/queue.h> +#include <machine/cpufunc.h> +#include <vm/vm_param.h> +#include <vm/lock.h> +#include <vm/vm_prot.h> +#include <vm/vm_inherit.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> + +/* + * Shareable process virtual address space. + * May eventually be merged with vm_map. + * Several fields are temporary (text, data stuff). + */ +struct vmspace { + struct vm_map vm_map; /* VM address map */ + struct pmap vm_pmap; /* private physical map */ + int vm_refcnt; /* number of references */ + caddr_t vm_shm; /* SYS5 shared memory private data XXX */ +/* we copy from vm_startcopy to the end of the structure on fork */ +#define vm_startcopy vm_rssize + segsz_t vm_rssize; /* current resident set size in pages */ + segsz_t vm_swrss; /* resident set size before last swap */ + segsz_t vm_tsize; /* text size (pages) XXX */ + segsz_t vm_dsize; /* data size (pages) XXX */ + segsz_t vm_ssize; /* stack size (pages) */ + caddr_t vm_taddr; /* user virtual address of text XXX */ + caddr_t vm_daddr; /* user virtual address of data XXX */ + caddr_t vm_maxsaddr; /* user VA at max stack growth */ + caddr_t vm_minsaddr; /* user VA at max stack growth */ +}; +#endif /* VM_H */ diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h new file mode 100644 index 0000000..bc62e42 --- /dev/null +++ b/sys/vm/vm_extern.h @@ -0,0 +1,135 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 + */ + +struct buf; +struct loadavg; +struct proc; +struct vmspace; +struct vmtotal; +struct mount; +struct vnode; + +#ifdef KGDB +void chgkprot __P((caddr_t, int, int)); +#endif + +/* + * Try to get semi-meaningful wait messages into thread_sleep... + */ +extern void thread_sleep_(int, simple_lock_t, char *); +#if __GNUC__ >= 2 +#define thread_sleep(a,b,c) thread_sleep_((a), (b), __FUNCTION__) +#else +#define thread_sleep(a,b,c) thread_sleep_((a), (b), "vmslp") +#endif + +#ifdef KERNEL +#ifdef TYPEDEF_FOR_UAP +int getpagesize __P((struct proc *p, void *, int *)); +int madvise __P((struct proc *, void *, int *)); +int mincore __P((struct proc *, void *, int *)); +int mprotect __P((struct proc *, void *, int *)); +int msync __P((struct proc *, void *, int *)); +int munmap __P((struct proc *, void *, int *)); +int obreak __P((struct proc *, void *, int *)); +int sbrk __P((struct proc *, void *, int *)); +int smmap __P((struct proc *, void *, int *)); +int sstk __P((struct proc *, void *, int *)); +#endif + +void assert_wait __P((int, boolean_t)); +int grow __P((struct proc *, u_int)); +void iprintf __P((const char *, ...)); +int kernacc __P((caddr_t, int, int)); +int kinfo_loadavg __P((int, char *, int *, int, int *)); +int kinfo_meter __P((int, caddr_t, int *, int, int *)); +vm_offset_t kmem_alloc __P((vm_map_t, vm_size_t)); +vm_offset_t kmem_alloc_pageable __P((vm_map_t, vm_size_t)); +vm_offset_t kmem_alloc_wait __P((vm_map_t, vm_size_t)); +void kmem_free __P((vm_map_t, vm_offset_t, vm_size_t)); +void kmem_free_wakeup __P((vm_map_t, vm_offset_t, vm_size_t)); +void kmem_init __P((vm_offset_t, vm_offset_t)); +vm_offset_t kmem_malloc __P((vm_map_t, vm_size_t, boolean_t)); +vm_map_t kmem_suballoc __P((vm_map_t, vm_offset_t *, vm_offset_t *, + vm_size_t, boolean_t)); +void loadav __P((struct loadavg *)); +void munmapfd __P((int)); +int pager_cache __P((vm_object_t, boolean_t)); +void sched __P((void)); +int svm_allocate __P((struct proc *, void *, int *)); +int svm_deallocate __P((struct proc *, void *, int *)); +int svm_inherit __P((struct proc *, void *, int *)); +int svm_protect __P((struct proc *, void *, int *)); +void swapinit __P((void)); +int swapon __P((struct proc *, void *, int *)); +void swapout __P((struct proc *)); +void swapout_threads __P((void)); +int swfree __P((struct proc *, int)); +void swstrategy __P((struct buf *)); +void thread_block __P((char *)); +void thread_sleep __P((int, simple_lock_t, boolean_t)); +void thread_wakeup __P((int)); +int useracc __P((caddr_t, int, int)); +int vm_allocate __P((vm_map_t, + vm_offset_t *, vm_size_t, boolean_t)); +int vm_allocate_with_pager __P((vm_map_t, vm_offset_t *, + vm_size_t, boolean_t, vm_pager_t, vm_offset_t, boolean_t)); +int vm_deallocate __P((vm_map_t, vm_offset_t, vm_size_t)); +int vm_fault __P((vm_map_t, vm_offset_t, vm_prot_t, boolean_t)); +void vm_fault_copy_entry __P((vm_map_t, + vm_map_t, vm_map_entry_t, vm_map_entry_t)); +void vm_fault_unwire 
__P((vm_map_t, vm_offset_t, vm_offset_t)); +int vm_fault_wire __P((vm_map_t, vm_offset_t, vm_offset_t)); +int vm_fork __P((struct proc *, struct proc *, int)); +int vm_inherit __P((vm_map_t, + vm_offset_t, vm_size_t, vm_inherit_t)); +void vm_init_limits __P((struct proc *)); +void vm_mem_init __P((void)); +int vm_mmap __P((vm_map_t, vm_offset_t *, vm_size_t, + vm_prot_t, vm_prot_t, int, caddr_t, vm_offset_t)); +int vm_protect __P((vm_map_t, + vm_offset_t, vm_size_t, boolean_t, vm_prot_t)); +void vm_set_page_size __P((void)); +void vmmeter __P((void)); +struct vmspace *vmspace_alloc __P((vm_offset_t, vm_offset_t, int)); +struct vmspace *vmspace_fork __P((struct vmspace *)); +void vmspace_free __P((struct vmspace *)); +void vmtotal __P((struct vmtotal *)); +void vnode_pager_setsize __P((struct vnode *, u_long)); +void vnode_pager_umount __P((struct mount *)); +boolean_t vnode_pager_uncache __P((struct vnode *)); +void vslock __P((caddr_t, u_int)); +void vsunlock __P((caddr_t, u_int, int)); +#endif diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c new file mode 100644 index 0000000..3ce2d6e --- /dev/null +++ b/sys/vm/vm_fault.c @@ -0,0 +1,1305 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Page fault handling module. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + + +#define VM_FAULT_READ_AHEAD 4 +#define VM_FAULT_READ_AHEAD_MIN 1 +#define VM_FAULT_READ_BEHIND 3 +#define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) +extern int swap_pager_full; +extern int vm_pageout_proc_limit; + +/* + * vm_fault: + * + * Handle a page fault occuring at the given address, + * requiring the given permissions, in the map specified. + * If successful, the page is inserted into the + * associated physical map. + * + * NOTE: the given address should be truncated to the + * proper page address. + * + * KERN_SUCCESS is returned if the page fault is handled; otherwise, + * a standard error specifying why the fault is fatal is returned. + * + * + * The map in question must be referenced, and remains so. + * Caller may hold no locks. 
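The contract stated above (a page-truncated address in, KERN_SUCCESS or an error out) is easiest to see from the caller's side. The sketch below is a hypothetical user-space stand-in for such a caller; the fault routine, map type, and page size are all local stubs, not the kernel's interfaces.

#include <stdio.h>

#define MY_PAGE_SIZE  4096
#define KERN_SUCCESS_ 0                 /* local stand-in for KERN_SUCCESS */

typedef unsigned long vaddr_t;
struct fake_map { int dummy; };

/* Stub with the same shape as vm_fault(map, vaddr, fault_type, change_wiring). */
static int fake_vm_fault(struct fake_map *map, vaddr_t va, int prot, int wiring)
{
	(void)map; (void)prot; (void)wiring;
	return (va < 0x80000000UL) ? KERN_SUCCESS_ : 1;   /* pretend high VAs fail */
}

int main(void)
{
	struct fake_map map;
	vaddr_t faulting_va = 0x1234abcUL;

	/* the faulting address is truncated to a page boundary first */
	vaddr_t page_va = faulting_va & ~(vaddr_t)(MY_PAGE_SIZE - 1);

	if (fake_vm_fault(&map, page_va, /* read */ 1, 0) != KERN_SUCCESS_)
		printf("deliver SIGSEGV\n");
	else
		printf("fault resolved, resume\n");
	return 0;
}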
+ */ +int +vm_fault(map, vaddr, fault_type, change_wiring) + vm_map_t map; + vm_offset_t vaddr; + vm_prot_t fault_type; + boolean_t change_wiring; +{ + vm_object_t first_object; + vm_offset_t first_offset; + vm_map_entry_t entry; + register vm_object_t object; + register vm_offset_t offset; + vm_page_t m; + vm_page_t first_m; + vm_prot_t prot; + int result; + boolean_t wired; + boolean_t su; + boolean_t lookup_still_valid; + boolean_t page_exists; + vm_page_t old_m; + vm_object_t next_object; + vm_page_t marray[VM_FAULT_READ]; + int reqpage; + int spl; + int hardfault=0; + + cnt.v_faults++; /* needs lock XXX */ +/* + * Recovery actions + */ +#define FREE_PAGE(m) { \ + PAGE_WAKEUP(m); \ + vm_page_lock_queues(); \ + vm_page_free(m); \ + vm_page_unlock_queues(); \ +} + +#define RELEASE_PAGE(m) { \ + PAGE_WAKEUP(m); \ + vm_page_lock_queues(); \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ +} + +#define UNLOCK_MAP { \ + if (lookup_still_valid) { \ + vm_map_lookup_done(map, entry); \ + lookup_still_valid = FALSE; \ + } \ +} + +#define UNLOCK_THINGS { \ + object->paging_in_progress--; \ + if (object->paging_in_progress == 0) \ + wakeup((caddr_t)object); \ + vm_object_unlock(object); \ + if (object != first_object) { \ + vm_object_lock(first_object); \ + FREE_PAGE(first_m); \ + first_object->paging_in_progress--; \ + if (first_object->paging_in_progress == 0) \ + wakeup((caddr_t)first_object); \ + vm_object_unlock(first_object); \ + } \ + UNLOCK_MAP; \ +} + +#define UNLOCK_AND_DEALLOCATE { \ + UNLOCK_THINGS; \ + vm_object_deallocate(first_object); \ +} + + + RetryFault: ; + + /* + * Find the backing store object and offset into + * it to begin the search. + */ + + if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, + &first_object, &first_offset, + &prot, &wired, &su)) != KERN_SUCCESS) { + return(result); + } + lookup_still_valid = TRUE; + + if (wired) + fault_type = prot; + + first_m = NULL; + + /* + * Make a reference to this object to + * prevent its disposal while we are messing with + * it. Once we have the reference, the map is free + * to be diddled. Since objects reference their + * shadows (and copies), they will stay around as well. + */ + + vm_object_lock(first_object); + + first_object->ref_count++; + first_object->paging_in_progress++; + + /* + * INVARIANTS (through entire routine): + * + * 1) At all times, we must either have the object + * lock or a busy page in some object to prevent + * some other thread from trying to bring in + * the same page. + * + * Note that we cannot hold any locks during the + * pager access or when waiting for memory, so + * we use a busy page then. + * + * Note also that we aren't as concerned about + * more than one thead attempting to pager_data_unlock + * the same page at once, so we don't hold the page + * as busy then, but do record the highest unlock + * value so far. [Unlock requests may also be delivered + * out of order.] + * + * 2) Once we have a busy page, we must remove it from + * the pageout queues, so that the pageout daemon + * will not grab it away. + * + * 3) To prevent another thread from racing us down the + * shadow chain and entering a new page in the top + * object before we do, we must keep a busy page in + * the top object while following the shadow chain. + * + * 4) We must increment paging_in_progress on any object + * for which we have a busy page, to prevent + * vm_object_collapse from removing the busy page + * without our noticing. + */ + + /* + * Search for the page at object/offset. 
+ */ + + object = first_object; + offset = first_offset; + + /* + * See whether this page is resident + */ + + while (TRUE) { + m = vm_page_lookup(object, offset); + if (m != NULL) { + /* + * If the page is being brought in, + * wait for it and then retry. + */ + if (m->flags & PG_BUSY) { + UNLOCK_THINGS; + if (m->flags & PG_BUSY) { + m->flags |= PG_WANTED; + tsleep((caddr_t)m,PSWP,"vmpfw",0); + } + vm_object_deallocate(first_object); + goto RetryFault; + } + + /* + * Remove the page from the pageout daemon's + * reach while we play with it. + */ + + vm_page_lock_queues(); + spl = splimp(); + if (m->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + m->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + cnt.v_reactivated++; + } + + if (m->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + m->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + splx(spl); + vm_page_unlock_queues(); + + /* + * Mark page busy for other threads. + */ + m->flags |= PG_BUSY; + break; + } + + if (((object->pager != NULL) && + (!change_wiring || wired)) + || (object == first_object)) { + +#if 0 + if (curproc && (vaddr < VM_MAXUSER_ADDRESS) && + (curproc->p_rlimit[RLIMIT_RSS].rlim_max < + curproc->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG)) { + UNLOCK_AND_DEALLOCATE; + vm_fault_free_pages(curproc); + goto RetryFault; + } +#endif + + if (swap_pager_full && !object->shadow && (!object->pager || + (object->pager && object->pager->pg_type == PG_SWAP && + !vm_pager_has_page(object->pager, offset+object->paging_offset)))) { + if (vaddr < VM_MAXUSER_ADDRESS && curproc && curproc->p_pid >= 48) /* XXX */ { + printf("Process %d killed by vm_fault -- out of swap\n", curproc->p_pid); + psignal(curproc, SIGKILL); + curproc->p_estcpu = 0; + curproc->p_nice = PRIO_MIN; + setpriority(curproc); + } + } + + /* + * Allocate a new page for this object/offset + * pair. + */ + + m = vm_page_alloc(object, offset); + + if (m == NULL) { + UNLOCK_AND_DEALLOCATE; + VM_WAIT; + goto RetryFault; + } + } + + if (object->pager != NULL && (!change_wiring || wired)) { + int rv; + int faultcount; + int reqpage; + + /* + * Now that we have a busy page, we can + * release the object lock. + */ + vm_object_unlock(object); + /* + * now we find out if any other pages should + * be paged in at this time + * this routine checks to see if the pages surrounding this fault + * reside in the same object as the page for this fault. If + * they do, then they are faulted in also into the + * object. The array "marray" returned contains an array of + * vm_page_t structs where one of them is the vm_page_t passed to + * the routine. The reqpage return value is the index into the + * marray for the vm_page_t passed to the routine. + */ + cnt.v_pageins++; + faultcount = vm_fault_additional_pages(first_object, first_offset, + m, VM_FAULT_READ_BEHIND, VM_FAULT_READ_AHEAD, marray, &reqpage); + + /* + * Call the pager to retrieve the data, if any, + * after releasing the lock on the map. + */ + UNLOCK_MAP; + + rv = faultcount ? + vm_pager_get_pages(object->pager, + marray, faultcount, reqpage, TRUE): VM_PAGER_FAIL; + if (rv == VM_PAGER_OK) { + /* + * Found the page. + * Leave it busy while we play with it. + */ + vm_object_lock(object); + + /* + * Relookup in case pager changed page. + * Pager is responsible for disposition + * of old page if moved. 
+ */ + m = vm_page_lookup(object, offset); + + cnt.v_pgpgin++; + m->flags &= ~PG_FAKE; + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + hardfault++; + break; + } + + /* + * Remove the bogus page (which does not + * exist at this object/offset); before + * doing so, we must get back our object + * lock to preserve our invariant. + * + * Also wake up any other thread that may want + * to bring in this page. + * + * If this is the top-level object, we must + * leave the busy page to prevent another + * thread from rushing past us, and inserting + * the page in that object at the same time + * that we are. + */ + + vm_object_lock(object); + /* + * Data outside the range of the pager; an error + */ + if ((rv == VM_PAGER_ERROR) || (rv == VM_PAGER_BAD)) { + FREE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + return(KERN_PROTECTION_FAILURE); /* XXX */ + } + if (object != first_object) { + FREE_PAGE(m); + /* + * XXX - we cannot just fall out at this + * point, m has been freed and is invalid! + */ + } + } + + /* + * We get here if the object has no pager (or unwiring) + * or the pager doesn't have the page. + */ + if (object == first_object) + first_m = m; + + /* + * Move on to the next object. Lock the next + * object before unlocking the current one. + */ + + offset += object->shadow_offset; + next_object = object->shadow; + if (next_object == NULL) { + /* + * If there's no object left, fill the page + * in the top object with zeros. + */ + if (object != first_object) { + object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); + vm_object_unlock(object); + + object = first_object; + offset = first_offset; + m = first_m; + vm_object_lock(object); + } + first_m = NULL; + + vm_page_zero_fill(m); + cnt.v_zfod++; + m->flags &= ~PG_FAKE; + break; + } + else { + vm_object_lock(next_object); + if (object != first_object) { + object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); + } + vm_object_unlock(object); + object = next_object; + object->paging_in_progress++; + } + } + + if ((m->flags & (PG_ACTIVE|PG_INACTIVE) != 0) || + (m->flags & PG_BUSY) == 0) + panic("vm_fault: absent or active or inactive or not busy after main loop"); + + /* + * PAGE HAS BEEN FOUND. + * [Loop invariant still holds -- the object lock + * is held.] + */ + + old_m = m; /* save page that would be copied */ + + /* + * If the page is being written, but isn't + * already owned by the top-level object, + * we have to copy it into a new page owned + * by the top-level object. + */ + + if (object != first_object) { + /* + * We only really need to copy if we + * want to write it. + */ + + if (fault_type & VM_PROT_WRITE) { + + /* + * If we try to collapse first_object at this + * point, we may deadlock when we try to get + * the lock on an intermediate object (since we + * have the bottom object locked). We can't + * unlock the bottom object, because the page + * we found may move (by collapse) if we do. + * + * Instead, we first copy the page. Then, when + * we have no more use for the bottom object, + * we unlock it and try to collapse. + * + * Note that we copy the page even if we didn't + * need to... that's the breaks. + */ + + /* + * We already have an empty page in + * first_object - use it. 
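The copy-on-write handling around this point reduces to: on a write fault against a page owned by a backing object, copy it into the faulting object's reserved page, revoke other mappings of the original, and continue with the copy; on a read fault, map the original and mark it copy-on-write for later. A small decision-function sketch with hypothetical types is below.

#include <stdio.h>
#include <string.h>

#define MY_PAGE_SIZE 4096

struct fake_page {
	unsigned char data[MY_PAGE_SIZE];
	int           copy_on_write;
};

/*
 * Resolve a fault on 'backing' when the faulting object already has an
 * empty page 'top' reserved for the copy.  Returns the page to map.
 */
static struct fake_page *
resolve_cow(struct fake_page *backing, struct fake_page *top, int write_fault,
    int *writable)
{
	if (write_fault) {
		memcpy(top->data, backing->data, MY_PAGE_SIZE); /* like vm_page_copy() */
		/* kernel: also revoke other mappings of the original page */
		*writable = 1;
		return top;
	}
	backing->copy_on_write = 1;     /* map read-only; copy later on a write */
	*writable = 0;
	return backing;
}

int main(void)
{
	static struct fake_page backing, top;
	int writable;
	struct fake_page *m = resolve_cow(&backing, &top, 1, &writable);

	printf("mapped %s page, writable=%d\n",
	    m == &top ? "copied" : "original", writable);
	return 0;
}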
+ */ + + vm_page_copy(m, first_m); + first_m->flags &= ~PG_FAKE; + + /* + * If another map is truly sharing this + * page with us, we have to flush all + * uses of the original page, since we + * can't distinguish those which want the + * original from those which need the + * new copy. + * + * XXX If we know that only one map has + * access to this page, then we could + * avoid the pmap_page_protect() call. + */ + + vm_page_lock_queues(); + + vm_page_activate(m); + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + if ((m->flags & PG_CLEAN) == 0) + m->flags |= PG_LAUNDRY; + vm_page_unlock_queues(); + + /* + * We no longer need the old page or object. + */ + PAGE_WAKEUP(m); + object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); + vm_object_unlock(object); + + /* + * Only use the new page below... + */ + + cnt.v_cow_faults++; + m = first_m; + object = first_object; + offset = first_offset; + + /* + * Now that we've gotten the copy out of the + * way, let's try to collapse the top object. + */ + vm_object_lock(object); + /* + * But we have to play ugly games with + * paging_in_progress to do that... + */ + object->paging_in_progress--; + if (object->paging_in_progress == 0) + wakeup((caddr_t) object); + vm_object_collapse(object); + object->paging_in_progress++; + } + else { + prot &= ~VM_PROT_WRITE; + m->flags |= PG_COPYONWRITE; + } + } + + if (m->flags & (PG_ACTIVE|PG_INACTIVE)) + panic("vm_fault: active or inactive before copy object handling"); + + /* + * If the page is being written, but hasn't been + * copied to the copy-object, we have to copy it there. + */ + RetryCopy: + if (first_object->copy != NULL) { + vm_object_t copy_object = first_object->copy; + vm_offset_t copy_offset; + vm_page_t copy_m; + + /* + * We only need to copy if we want to write it. + */ + if ((fault_type & VM_PROT_WRITE) == 0) { + prot &= ~VM_PROT_WRITE; + m->flags |= PG_COPYONWRITE; + } + else { + /* + * Try to get the lock on the copy_object. + */ + if (!vm_object_lock_try(copy_object)) { + vm_object_unlock(object); + /* should spin a bit here... */ + vm_object_lock(object); + goto RetryCopy; + } + + /* + * Make another reference to the copy-object, + * to keep it from disappearing during the + * copy. + */ + copy_object->ref_count++; + + /* + * Does the page exist in the copy? + */ + copy_offset = first_offset + - copy_object->shadow_offset; + copy_m = vm_page_lookup(copy_object, copy_offset); + if (page_exists = (copy_m != NULL)) { + if (copy_m->flags & PG_BUSY) { + /* + * If the page is being brought + * in, wait for it and then retry. + */ + PAGE_ASSERT_WAIT(copy_m, !change_wiring); + RELEASE_PAGE(m); + copy_object->ref_count--; + vm_object_unlock(copy_object); + UNLOCK_THINGS; + thread_block("fltcpy"); + vm_object_deallocate(first_object); + goto RetryFault; + } + } + + /* + * If the page is not in memory (in the object) + * and the object has a pager, we have to check + * if the pager has the data in secondary + * storage. + */ + if (!page_exists) { + + /* + * If we don't allocate a (blank) page + * here... another thread could try + * to page it in, allocate a page, and + * then block on the busy page in its + * shadow (first_object). Then we'd + * trip over the busy page after we + * found that the copy_object's pager + * doesn't have the page... + */ + copy_m = vm_page_alloc(copy_object, copy_offset); + if (copy_m == NULL) { + /* + * Wait for a page, then retry. 
+ */ + RELEASE_PAGE(m); + copy_object->ref_count--; + vm_object_unlock(copy_object); + UNLOCK_AND_DEALLOCATE; + VM_WAIT; + goto RetryFault; + } + + if (copy_object->pager != NULL) { + vm_object_unlock(object); + vm_object_unlock(copy_object); + UNLOCK_MAP; + + page_exists = vm_pager_has_page( + copy_object->pager, + (copy_offset + copy_object->paging_offset)); + + vm_object_lock(copy_object); + + /* + * Since the map is unlocked, someone + * else could have copied this object + * and put a different copy_object + * between the two. Or, the last + * reference to the copy-object (other + * than the one we have) may have + * disappeared - if that has happened, + * we don't need to make the copy. + */ + if (copy_object->shadow != object || + copy_object->ref_count == 1) { + /* + * Gaah... start over! + */ + FREE_PAGE(copy_m); + vm_object_unlock(copy_object); + vm_object_deallocate(copy_object); + /* may block */ + vm_object_lock(object); + goto RetryCopy; + } + vm_object_lock(object); + + if (page_exists) { + /* + * We didn't need the page + */ + FREE_PAGE(copy_m); + } + } + } + if (!page_exists) { + /* + * Must copy page into copy-object. + */ + vm_page_copy(m, copy_m); + copy_m->flags &= ~PG_FAKE; + + /* + * Things to remember: + * 1. The copied page must be marked 'dirty' + * so it will be paged out to the copy + * object. + * 2. If the old page was in use by any users + * of the copy-object, it must be removed + * from all pmaps. (We can't know which + * pmaps use it.) + */ + vm_page_lock_queues(); + + vm_page_activate(old_m); + + + pmap_page_protect(VM_PAGE_TO_PHYS(old_m), + VM_PROT_NONE); + if ((old_m->flags & PG_CLEAN) == 0) + old_m->flags |= PG_LAUNDRY; + copy_m->flags &= ~PG_CLEAN; + vm_page_activate(copy_m); + vm_page_unlock_queues(); + + PAGE_WAKEUP(copy_m); + } + /* + * The reference count on copy_object must be + * at least 2: one for our extra reference, + * and at least one from the outside world + * (we checked that when we last locked + * copy_object). + */ + copy_object->ref_count--; + vm_object_unlock(copy_object); + m->flags &= ~PG_COPYONWRITE; + } + } + + if (m->flags & (PG_ACTIVE | PG_INACTIVE)) + panic("vm_fault: active or inactive before retrying lookup"); + + /* + * We must verify that the maps have not changed + * since our last lookup. + */ + + if (!lookup_still_valid) { + vm_object_t retry_object; + vm_offset_t retry_offset; + vm_prot_t retry_prot; + + /* + * Since map entries may be pageable, make sure we can + * take a page fault on them. + */ + vm_object_unlock(object); + + /* + * To avoid trying to write_lock the map while another + * thread has it read_locked (in vm_map_pageable), we + * do not try for write permission. If the page is + * still writable, we will get write permission. If it + * is not, or has been marked needs_copy, we enter the + * mapping without write permission, and will merely + * take another fault. + */ + result = vm_map_lookup(&map, vaddr, + fault_type & ~VM_PROT_WRITE, &entry, + &retry_object, &retry_offset, &retry_prot, + &wired, &su); + + vm_object_lock(object); + + /* + * If we don't need the page any longer, put it on the + * active list (the easiest thing to do here). If no + * one needs it, pageout will grab it eventually. 
+ */ + + if (result != KERN_SUCCESS) { + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + return(result); + } + + lookup_still_valid = TRUE; + + if ((retry_object != first_object) || + (retry_offset != first_offset)) { + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto RetryFault; + } + + /* + * Check whether the protection has changed or the object + * has been copied while we left the map unlocked. + * Changing from read to write permission is OK - we leave + * the page write-protected, and catch the write fault. + * Changing from write to read permission means that we + * can't mark the page write-enabled after all. + */ + prot &= retry_prot; + if (m->flags & PG_COPYONWRITE) + prot &= ~VM_PROT_WRITE; + } + + /* + * (the various bits we're fiddling with here are locked by + * the object's lock) + */ + + /* XXX This distorts the meaning of the copy_on_write bit */ + + if (prot & VM_PROT_WRITE) + m->flags &= ~PG_COPYONWRITE; + + /* + * It's critically important that a wired-down page be faulted + * only once in each map for which it is wired. + */ + + if (m->flags & (PG_ACTIVE | PG_INACTIVE)) + panic("vm_fault: active or inactive before pmap_enter"); + + vm_object_unlock(object); + + /* + * Put this page into the physical map. + * We had to do the unlock above because pmap_enter + * may cause other faults. We don't put the + * page back on the active queue until later so + * that the page-out daemon won't find us (yet). + */ + + pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired); + + /* + * If the page is not wired down, then put it where the + * pageout daemon can find it. + */ + vm_object_lock(object); + vm_page_lock_queues(); + if (change_wiring) { + if (wired) + vm_page_wire(m); + else + vm_page_unwire(m); + } + else { + vm_page_activate(m); + } + + if( curproc && curproc->p_stats) { + if (hardfault) { + curproc->p_stats->p_ru.ru_majflt++; + } else { + curproc->p_stats->p_ru.ru_minflt++; + } + } + + vm_page_unlock_queues(); + + /* + * Unlock everything, and return + */ + + PAGE_WAKEUP(m); + UNLOCK_AND_DEALLOCATE; + + return(KERN_SUCCESS); + +} + +/* + * vm_fault_wire: + * + * Wire down a range of virtual addresses in a map. + */ +int +vm_fault_wire(map, start, end) + vm_map_t map; + vm_offset_t start, end; +{ + + register vm_offset_t va; + register pmap_t pmap; + int rv; + + pmap = vm_map_pmap(map); + + /* + * Inform the physical mapping system that the + * range of addresses may not fault, so that + * page tables and such can be locked down as well. + */ + + pmap_pageable(pmap, start, end, FALSE); + + /* + * We simulate a fault to get the page and enter it + * in the physical map. + */ + + for (va = start; va < end; va += PAGE_SIZE) { + rv = vm_fault(map, va, VM_PROT_NONE, TRUE); + if (rv) { + if (va != start) + vm_fault_unwire(map, start, va); + return(rv); + } + } + return(KERN_SUCCESS); +} + + +/* + * vm_fault_unwire: + * + * Unwire a range of virtual addresses in a map. + */ +void +vm_fault_unwire(map, start, end) + vm_map_t map; + vm_offset_t start, end; +{ + + register vm_offset_t va, pa; + register pmap_t pmap; + + pmap = vm_map_pmap(map); + + /* + * Since the pages are wired down, we must be able to + * get their mappings from the physical map system. 
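vm_fault_wire above shows a common all-or-nothing pattern: fault each page in the range, and if any page fails, undo the pages already wired before returning the error. A generic sketch of that rollback loop, with the per-page operation reduced to a stub, follows.

#include <stdio.h>

#define MY_PAGE_SIZE 4096

/* Hypothetical per-page operation: returns 0 on success. */
static int wire_one(unsigned long va)
{
	return (va >= 5 * MY_PAGE_SIZE) ? -1 : 0;   /* pretend the 6th page fails */
}

static void unwire_range(unsigned long start, unsigned long end)
{
	unsigned long va;

	for (va = start; va < end; va += MY_PAGE_SIZE)
		printf("unwire 0x%lx\n", va);
}

/* Wire [start, end); on failure roll back everything wired so far. */
static int wire_range(unsigned long start, unsigned long end)
{
	unsigned long va;
	int rv;

	for (va = start; va < end; va += MY_PAGE_SIZE) {
		rv = wire_one(va);
		if (rv) {
			if (va != start)
				unwire_range(start, va);
			return rv;
		}
	}
	return 0;
}

int main(void)
{
	printf("rv=%d\n", wire_range(0, 8 * MY_PAGE_SIZE));
	return 0;
}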
+ */ + + vm_page_lock_queues(); + + for (va = start; va < end; va += PAGE_SIZE) { + pa = pmap_extract(pmap, va); + if (pa == (vm_offset_t) 0) { + panic("unwire: page not in pmap"); + } + pmap_change_wiring(pmap, va, FALSE); + vm_page_unwire(PHYS_TO_VM_PAGE(pa)); + } + vm_page_unlock_queues(); + + /* + * Inform the physical mapping system that the range + * of addresses may fault, so that page tables and + * such may be unwired themselves. + */ + + pmap_pageable(pmap, start, end, TRUE); + +} + +/* + * Routine: + * vm_fault_copy_entry + * Function: + * Copy all of the pages from a wired-down map entry to another. + * + * In/out conditions: + * The source and destination maps must be locked for write. + * The source map entry must be wired down (or be a sharing map + * entry corresponding to a main map entry that is wired down). + */ + +void +vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) + vm_map_t dst_map; + vm_map_t src_map; + vm_map_entry_t dst_entry; + vm_map_entry_t src_entry; +{ + vm_object_t dst_object; + vm_object_t src_object; + vm_offset_t dst_offset; + vm_offset_t src_offset; + vm_prot_t prot; + vm_offset_t vaddr; + vm_page_t dst_m; + vm_page_t src_m; + +#ifdef lint + src_map++; +#endif lint + + src_object = src_entry->object.vm_object; + src_offset = src_entry->offset; + + /* + * Create the top-level object for the destination entry. + * (Doesn't actually shadow anything - we copy the pages + * directly.) + */ + dst_object = vm_object_allocate( + (vm_size_t) (dst_entry->end - dst_entry->start)); + + dst_entry->object.vm_object = dst_object; + dst_entry->offset = 0; + + prot = dst_entry->max_protection; + + /* + * Loop through all of the pages in the entry's range, copying + * each one from the source object (it should be there) to the + * destination object. + */ + for (vaddr = dst_entry->start, dst_offset = 0; + vaddr < dst_entry->end; + vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { + + /* + * Allocate a page in the destination object + */ + vm_object_lock(dst_object); + do { + dst_m = vm_page_alloc(dst_object, dst_offset); + if (dst_m == NULL) { + vm_object_unlock(dst_object); + VM_WAIT; + vm_object_lock(dst_object); + } + } while (dst_m == NULL); + + /* + * Find the page in the source object, and copy it in. + * (Because the source is wired down, the page will be + * in memory.) + */ + vm_object_lock(src_object); + src_m = vm_page_lookup(src_object, dst_offset + src_offset); + if (src_m == NULL) + panic("vm_fault_copy_wired: page missing"); + + vm_page_copy(src_m, dst_m); + + /* + * Enter it in the pmap... + */ + vm_object_unlock(src_object); + vm_object_unlock(dst_object); + + pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), + prot, FALSE); + + /* + * Mark it no longer busy, and put it on the active list. 
+ */ + vm_object_lock(dst_object); + vm_page_lock_queues(); + vm_page_activate(dst_m); + vm_page_unlock_queues(); + PAGE_WAKEUP(dst_m); + vm_object_unlock(dst_object); + } +} + + +/* + * looks page up in shadow chain + */ + +int +vm_fault_page_lookup(object, offset, rtobject, rtoffset, rtm) + vm_object_t object; + vm_offset_t offset; + vm_object_t *rtobject; + vm_offset_t *rtoffset; + vm_page_t *rtm; +{ + vm_page_t m; + vm_object_t first_object = object; + + *rtm = 0; + *rtobject = 0; + *rtoffset = 0; + + + while (!(m=vm_page_lookup(object, offset))) { + if (object->pager) { + if (vm_pager_has_page(object->pager, object->paging_offset+offset)) { + *rtobject = object; + *rtoffset = offset; + return 1; + } + } + + if (!object->shadow) + return 0; + else { + offset += object->shadow_offset; + object = object->shadow; + } + } + *rtobject = object; + *rtoffset = offset; + *rtm = m; + return 1; +} + +/* + * This routine checks around the requested page for other pages that + * might be able to be faulted in. + * + * Inputs: + * first_object, first_offset, m, rbehind, rahead + * + * Outputs: + * marray (array of vm_page_t), reqpage (index of requested page) + * + * Return value: + * number of pages in marray + */ +int +vm_fault_additional_pages(first_object, first_offset, m, rbehind, raheada, marray, reqpage) + vm_object_t first_object; + vm_offset_t first_offset; + vm_page_t m; + int rbehind; + int raheada; + vm_page_t *marray; + int *reqpage; +{ + int i; + vm_page_t tmpm; + vm_object_t object; + vm_offset_t offset, startoffset, endoffset, toffset, size; + vm_object_t rtobject; + vm_page_t rtm; + vm_offset_t rtoffset; + vm_offset_t offsetdiff; + int rahead; + int treqpage; + + object = m->object; + offset = m->offset; + + offsetdiff = offset - first_offset; + + /* + * if the requested page is not available, then give up now + */ + + if (!vm_pager_has_page(object->pager, object->paging_offset+offset)) + return 0; + + /* + * if there is no getmulti routine for this pager, then just allow + * one page to be read. + */ +/* + if (!object->pager->pg_ops->pgo_getpages) { + *reqpage = 0; + marray[0] = m; + return 1; + } +*/ + + /* + * try to do any readahead that we might have free pages for. + */ + rahead = raheada; + if (rahead > (cnt.v_free_count - cnt.v_free_reserved)) { + rahead = cnt.v_free_count - cnt.v_free_reserved; + rbehind = 0; + } + + if (cnt.v_free_count < cnt.v_free_min) { + if (rahead > VM_FAULT_READ_AHEAD_MIN) + rahead = VM_FAULT_READ_AHEAD_MIN; + rbehind = 0; + } + + /* + * if we don't have any free pages, then just read one page. 
+ */ + if (rahead <= 0) { + *reqpage = 0; + marray[0] = m; + return 1; + } + + /* + * scan backward for the read behind pages -- + * in memory or on disk not in same object + */ + toffset = offset - NBPG; + if( rbehind*NBPG > offset) + rbehind = offset / NBPG; + startoffset = offset - rbehind*NBPG; + while (toffset >= startoffset) { + if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) || + rtm != 0 || rtobject != object) { + startoffset = toffset + NBPG; + break; + } + if( toffset == 0) + break; + toffset -= NBPG; + } + + /* + * scan forward for the read ahead pages -- + * in memory or on disk not in same object + */ + toffset = offset + NBPG; + endoffset = offset + (rahead+1)*NBPG; + while (toffset < object->size && toffset < endoffset) { + if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) || + rtm != 0 || rtobject != object) { + break; + } + toffset += NBPG; + } + endoffset = toffset; + + /* calculate number of bytes of pages */ + size = (endoffset - startoffset) / NBPG; + + /* calculate the page offset of the required page */ + treqpage = (offset - startoffset) / NBPG; + + /* see if we have space (again) */ + if (cnt.v_free_count >= cnt.v_free_reserved + size) { + bzero(marray, (rahead + rbehind + 1) * sizeof(vm_page_t)); + /* + * get our pages and don't block for them + */ + for (i = 0; i < size; i++) { + if (i != treqpage) + rtm = vm_page_alloc(object, startoffset + i * NBPG); + else + rtm = m; + marray[i] = rtm; + } + + for (i = 0; i < size; i++) { + if (marray[i] == 0) + break; + } + + /* + * if we could not get our block of pages, then + * free the readahead/readbehind pages. + */ + if (i < size) { + for (i = 0; i < size; i++) { + if (i != treqpage && marray[i]) + FREE_PAGE(marray[i]); + } + *reqpage = 0; + marray[0] = m; + return 1; + } + + *reqpage = treqpage; + return size; + } + *reqpage = 0; + marray[0] = m; + return 1; +} + diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c new file mode 100644 index 0000000..f181ab0 --- /dev/null +++ b/sys/vm/vm_glue.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/buf.h> +#include <sys/user.h> + +#include <sys/kernel.h> +#include <sys/dkstat.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> + +#include <machine/stdarg.h> + +extern char kstack[]; +int avefree = 0; /* XXX */ +int readbuffers = 0; /* XXX allow kgdb to read kernel buffer pool */ +/* vm_map_t upages_map; */ + +void swapout(struct proc *p); +int +kernacc(addr, len, rw) + caddr_t addr; + int len, rw; +{ + boolean_t rv; + vm_offset_t saddr, eaddr; + vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + + saddr = trunc_page(addr); + eaddr = round_page(addr+len); + rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); + return(rv == TRUE); +} + +int +useracc(addr, len, rw) + caddr_t addr; + int len, rw; +{ + boolean_t rv; + vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + + /* + * XXX - specially disallow access to user page tables - they are + * in the map. + * + * XXX - don't specially disallow access to the user area - treat + * it as incorrectly as elsewhere. + * + * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was + * only used (as an end address) in trap.c. Use it as an end + * address here too. + */ + if ((vm_offset_t) addr >= VM_MAXUSER_ADDRESS + || (vm_offset_t) addr + len > VM_MAXUSER_ADDRESS + || (vm_offset_t) addr + len <= (vm_offset_t) addr) { + return (FALSE); + } + + rv = vm_map_check_protection(&curproc->p_vmspace->vm_map, + trunc_page(addr), round_page(addr+len), prot); + return(rv == TRUE); +} + +#ifdef KGDB +/* + * Change protections on kernel pages from addr to addr+len + * (presumably so debugger can plant a breakpoint). + * All addresses are assumed to reside in the Sysmap, + */ +chgkprot(addr, len, rw) + register caddr_t addr; + int len, rw; +{ + vm_prot_t prot = rw == B_READ ? 
VM_PROT_READ : VM_PROT_WRITE; + + vm_map_protect(kernel_map, trunc_page(addr), + round_page(addr+len), prot, FALSE); +} +#endif +void +vslock(addr, len) + caddr_t addr; + u_int len; +{ + vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), + round_page(addr+len), FALSE); +} + +void +vsunlock(addr, len, dirtied) + caddr_t addr; + u_int len; + int dirtied; +{ +#ifdef lint + dirtied++; +#endif lint + vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), + round_page(addr+len), TRUE); +} + +/* + * Implement fork's actions on an address space. + * Here we arrange for the address space to be copied or referenced, + * allocate a user struct (pcb and kernel stack), then call the + * machine-dependent layer to fill those in and make the new process + * ready to run. + * NOTE: the kernel stack may be at a different location in the child + * process, and thus addresses of automatic variables may be invalid + * after cpu_fork returns in the child process. We do nothing here + * after cpu_fork returns. + */ +int +vm_fork(p1, p2, isvfork) + register struct proc *p1, *p2; + int isvfork; +{ + register struct user *up; + vm_offset_t addr, ptaddr; + int i; + struct vm_map *vp; + + while( cnt.v_free_count < cnt.v_free_min) + VM_WAIT; + + /* + * avoid copying any of the parent's pagetables or other per-process + * objects that reside in the map by marking all of them non-inheritable + */ + (void)vm_map_inherit(&p1->p_vmspace->vm_map, + UPT_MIN_ADDRESS - UPAGES * NBPG, VM_MAX_ADDRESS, VM_INHERIT_NONE); + p2->p_vmspace = vmspace_fork(p1->p_vmspace); + +#ifdef SYSVSHM + if (p1->p_vmspace->vm_shm) + shmfork(p1, p2, isvfork); +#endif + + /* + * Allocate a wired-down (for now) pcb and kernel stack for the process + */ + + addr = (vm_offset_t) kstack; + + vp = &p2->p_vmspace->vm_map; + + /* ream out old pagetables and kernel stack */ + (void)vm_deallocate(vp, addr, UPT_MAX_ADDRESS - addr); + + /* get new pagetables and kernel stack */ + (void)vm_allocate(vp, &addr, UPT_MAX_ADDRESS - addr, FALSE); + + /* force in the page table encompassing the UPAGES */ + ptaddr = trunc_page((u_int)vtopte(addr)); + vm_map_pageable(vp, ptaddr, ptaddr + NBPG, FALSE); + + /* and force in (demand-zero) the UPAGES */ + vm_map_pageable(vp, addr, addr + UPAGES * NBPG, FALSE); + + /* get a kernel virtual address for the UPAGES for this proc */ + up = (struct user *)kmem_alloc_pageable(kernel_map, UPAGES * NBPG); + + /* and force-map the upages into the kernel pmap */ + for (i = 0; i < UPAGES; i++) + pmap_enter(vm_map_pmap(kernel_map), + ((vm_offset_t) up) + NBPG * i, + pmap_extract(vp->pmap, addr + NBPG * i), + VM_PROT_READ|VM_PROT_WRITE, 1); + + /* and allow the UPAGES page table entry to be paged (at the vm system level) */ + vm_map_pageable(vp, ptaddr, ptaddr + NBPG, TRUE); + + p2->p_addr = up; + + /* + * p_stats and p_sigacts currently point at fields + * in the user struct but not at &u, instead at p_addr. + * Copy p_sigacts and parts of p_stats; zero the rest + * of p_stats (statistics). + */ + p2->p_stats = &up->u_stats; + p2->p_sigacts = &up->u_sigacts; + up->u_sigacts = *p1->p_sigacts; + bzero(&up->u_stats.pstat_startzero, + (unsigned) ((caddr_t)&up->u_stats.pstat_endzero - + (caddr_t)&up->u_stats.pstat_startzero)); + bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, + ((caddr_t)&up->u_stats.pstat_endcopy - + (caddr_t)&up->u_stats.pstat_startcopy)); + + + /* + * cpu_fork will copy and update the kernel stack and pcb, + * and make the child ready to run. 
It marks the child + * so that it can return differently than the parent. + * It returns twice, once in the parent process and + * once in the child. + */ + return (cpu_fork(p1, p2)); +} + +/* + * Set default limits for VM system. + * Called for proc 0, and then inherited by all others. + */ +void +vm_init_limits(p) + register struct proc *p; +{ + int tmp; + + /* + * Set up the initial limits on process VM. + * Set the maximum resident set size to be all + * of (reasonably) available memory. This causes + * any single, large process to start random page + * replacement once it fills memory. + */ + p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; + p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; + p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; + p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; + tmp = ((2 * cnt.v_free_count) / 3) - 32; + if (cnt.v_free_count < 512) + tmp = cnt.v_free_count; + p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(tmp); + p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; +} + +#ifdef DEBUG +int enableswap = 1; +int swapdebug = 0; +#define SDB_FOLLOW 1 +#define SDB_SWAPIN 2 +#define SDB_SWAPOUT 4 +#endif + +void +faultin(p) +struct proc *p; +{ + vm_offset_t i; + vm_offset_t vaddr, ptaddr; + vm_offset_t v, v1; + struct user *up; + int s; + int opflag; + + if ((p->p_flag & P_INMEM) == 0) { + int rv0, rv1; + vm_map_t map; + + ++p->p_lock; + + map = &p->p_vmspace->vm_map; + /* force the page table encompassing the kernel stack (upages) */ + ptaddr = trunc_page((u_int)vtopte(kstack)); + vm_map_pageable(map, ptaddr, ptaddr + NBPG, FALSE); + + /* wire in the UPAGES */ + vm_map_pageable(map, (vm_offset_t) kstack, + (vm_offset_t) kstack + UPAGES * NBPG, FALSE); + + /* and map them nicely into the kernel pmap */ + for (i = 0; i < UPAGES; i++) { + vm_offset_t off = i * NBPG; + vm_offset_t pa = (vm_offset_t) + pmap_extract(&p->p_vmspace->vm_pmap, + (vm_offset_t) kstack + off); + pmap_enter(vm_map_pmap(kernel_map), + ((vm_offset_t)p->p_addr) + off, + pa, VM_PROT_READ|VM_PROT_WRITE, 1); + } + + /* and let the page table pages go (at least above pmap level) */ + vm_map_pageable(map, ptaddr, ptaddr + NBPG, TRUE); + + s = splhigh(); + + if (p->p_stat == SRUN) + setrunqueue(p); + + p->p_flag |= P_INMEM; + + /* undo the effect of setting SLOCK above */ + --p->p_lock; + splx(s); + + } + +} + +int swapinreq; +int percentactive; +/* + * This swapin algorithm attempts to swap-in processes only if there + * is enough space for them. Of course, if a process waits for a long + * time, it will be swapped in anyway. + */ +void +scheduler() +{ + register struct proc *p; + register int pri; + struct proc *pp; + int ppri; + vm_offset_t addr; + int lastidle, lastrun; + int curidle, currun; + int forceload; + int percent; + int ntries; + + lastidle = 0; + lastrun = 0; + +loop: + ntries = 0; + vmmeter(); + + curidle = cp_time[CP_IDLE]; + currun = cp_time[CP_USER] + cp_time[CP_SYS] + cp_time[CP_NICE]; + percent = (100*(currun-lastrun)) / ( 1 + (currun-lastrun) + (curidle-lastidle)); + lastrun = currun; + lastidle = curidle; + if( percent > 100) + percent = 100; + percentactive = percent; + + if( percentactive < 25) + forceload = 1; + else + forceload = 0; + +loop1: + pp = NULL; + ppri = INT_MIN; + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_stat == SRUN && (p->p_flag & P_INMEM) == 0) { + int mempri; + pri = p->p_swtime + p->p_slptime - p->p_nice * 8; + mempri = pri > 0 ? 
pri : 0; + /* + * if this process is higher priority and there is + * enough space, then select this process instead + * of the previous selection. + */ + if (pri > ppri && + (((cnt.v_free_count + (mempri * (4*PAGE_SIZE) / PAGE_SIZE) >= (p->p_vmspace->vm_swrss)) || (ntries > 0 && forceload)))) { + pp = p; + ppri = pri; + } + } + } + + if ((pp == NULL) && (ntries == 0) && forceload) { + ++ntries; + goto loop1; + } + + /* + * Nothing to do, back to sleep + */ + if ((p = pp) == NULL) { + tsleep((caddr_t)&proc0, PVM, "sched", 0); + goto loop; + } + + /* + * We would like to bring someone in. (only if there is space). + */ +/* + printf("swapin: %d, free: %d, res: %d, min: %d\n", + p->p_pid, cnt.v_free_count, cnt.v_free_reserved, cnt.v_free_min); +*/ + (void) splhigh(); + if ((forceload && (cnt.v_free_count > (cnt.v_free_reserved + UPAGES + 1))) || + (cnt.v_free_count >= cnt.v_free_min)) { + spl0(); + faultin(p); + p->p_swtime = 0; + goto loop; + } + /* + * log the memory shortage + */ + swapinreq += p->p_vmspace->vm_swrss; + /* + * Not enough memory, jab the pageout daemon and wait til the + * coast is clear. + */ + if( cnt.v_free_count < cnt.v_free_min) { + VM_WAIT; + } else { + tsleep((caddr_t)&proc0, PVM, "sched", 0); + } + (void) spl0(); + goto loop; +} + +#define swappable(p) \ + (((p)->p_lock == 0) && \ + ((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|P_WEXIT|P_PHYSIO)) == P_INMEM) + +extern int vm_pageout_free_min; +/* + * Swapout is driven by the pageout daemon. Very simple, we find eligible + * procs and unwire their u-areas. We try to always "swap" at least one + * process in case we need the room for a swapin. + * If any procs have been sleeping/stopped for at least maxslp seconds, + * they are swapped. Else, we swap the longest-sleeping or stopped process, + * if any, otherwise the longest-resident process. + */ +void +swapout_threads() +{ + register struct proc *p; + struct proc *outp, *outp2; + int outpri, outpri2; + int tpri; + int didswap = 0; + int swapneeded = swapinreq; + extern int maxslp; + int runnablenow; + int s; + +swapmore: + runnablenow = 0; + outp = outp2 = NULL; + outpri = outpri2 = INT_MIN; + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (!swappable(p)) + continue; + switch (p->p_stat) { + case SRUN: + ++runnablenow; + /* + * count the process as being in a runnable state + */ + if ((tpri = p->p_swtime + p->p_nice * 8) > outpri2) { + outp2 = p; + outpri2 = tpri; + } + continue; + + case SSLEEP: + case SSTOP: + /* + * do not swapout a process that is waiting for VM datastructures + * there is a possible deadlock. + */ + if (!lock_try_write( &p->p_vmspace->vm_map.lock)) { + continue; + } + vm_map_unlock( &p->p_vmspace->vm_map); + if (p->p_slptime > maxslp) { + swapout(p); + didswap++; + } else if ((tpri = p->p_slptime + p->p_nice * 8) > outpri) { + outp = p; + outpri = tpri ; + } + continue; + } + } + /* + * We swapout only if there are more than two runnable processes or if + * another process needs some space to swapin. + */ + if ((swapinreq || ((percentactive > 90) && (runnablenow > 2))) && + (((cnt.v_free_count + cnt.v_inactive_count) <= (cnt.v_free_target + cnt.v_inactive_target)) || + (cnt.v_free_count < cnt.v_free_min))) { + if ((p = outp) == 0) { + p = outp2; + } + + if (p) { + swapout(p); + didswap = 1; + } + } + + /* + * if we previously had found a process to swapout, and we need to swapout + * more then try again. 
+ */ +#if 0 + if( p && swapinreq) + goto swapmore; +#endif + + /* + * If we swapped something out, and another process needed memory, + * then wakeup the sched process. + */ + if (didswap) { + if (swapneeded) + wakeup((caddr_t)&proc0); + swapinreq = 0; + } +} + +void +swapout(p) + register struct proc *p; +{ + vm_offset_t addr; + struct pmap *pmap = &p->p_vmspace->vm_pmap; + vm_map_t map = &p->p_vmspace->vm_map; + vm_offset_t ptaddr; + int i; + + ++p->p_stats->p_ru.ru_nswap; + /* + * remember the process resident count + */ + p->p_vmspace->vm_swrss = + p->p_vmspace->vm_pmap.pm_stats.resident_count; + /* + * and decrement the amount of needed space + */ + swapinreq -= min(swapinreq, p->p_vmspace->vm_pmap.pm_stats.resident_count); + + (void) splhigh(); + p->p_flag &= ~P_INMEM; + if (p->p_stat == SRUN) + remrq(p); + (void) spl0(); + + ++p->p_lock; +/* let the upages be paged */ + pmap_remove(vm_map_pmap(kernel_map), + (vm_offset_t) p->p_addr, ((vm_offset_t) p->p_addr) + UPAGES * NBPG); + + vm_map_pageable(map, (vm_offset_t) kstack, + (vm_offset_t) kstack + UPAGES * NBPG, TRUE); + + --p->p_lock; + p->p_swtime = 0; +} + +/* + * The rest of these routines fake thread handling + */ + +#ifndef assert_wait +void +assert_wait(event, ruptible) + int event; + boolean_t ruptible; +{ +#ifdef lint + ruptible++; +#endif + curproc->p_thread = event; +} +#endif + +void +thread_block(char *msg) +{ + if (curproc->p_thread) + tsleep((caddr_t)curproc->p_thread, PVM, msg, 0); +} + + +void +thread_sleep_(event, lock, wmesg) + int event; + simple_lock_t lock; + char *wmesg; +{ + + curproc->p_thread = event; + simple_unlock(lock); + if (curproc->p_thread) { + tsleep((caddr_t)event, PVM, wmesg, 0); + } +} + +#ifndef thread_wakeup +void +thread_wakeup(event) + int event; +{ + wakeup((caddr_t)event); +} +#endif + +/* + * DEBUG stuff + */ + +int indent = 0; + +#include <machine/stdarg.h> /* see subr_prf.c */ + +/*ARGSUSED2*/ +void +#if __STDC__ +iprintf(const char *fmt, ...) +#else +iprintf(fmt /* , va_alist */) + char *fmt; + /* va_dcl */ +#endif +{ + register int i; + va_list ap; + + for (i = indent; i >= 8; i -= 8) + printf("\t"); + while (--i >= 0) + printf(" "); + va_start(ap, fmt); + printf("%r", fmt, ap); + va_end(ap); +} diff --git a/sys/vm/vm_inherit.h b/sys/vm/vm_inherit.h new file mode 100644 index 0000000..455f91c --- /dev/null +++ b/sys/vm/vm_inherit.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_inherit.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory map inheritance definitions. + */ + +#ifndef _VM_INHERIT_ +#define _VM_INHERIT_ + +/* + * Enumeration of valid values for vm_inherit_t. + */ + +#define VM_INHERIT_SHARE ((vm_inherit_t) 0) /* share with child */ +#define VM_INHERIT_COPY ((vm_inherit_t) 1) /* copy into child */ +#define VM_INHERIT_NONE ((vm_inherit_t) 2) /* absent from child */ +#define VM_INHERIT_DONATE_COPY ((vm_inherit_t) 3) /* copy and delete */ + +#define VM_INHERIT_DEFAULT VM_INHERIT_COPY + +#endif /* _VM_INHERIT_ */ diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c new file mode 100644 index 0000000..a0eac70 --- /dev/null +++ b/sys/vm/vm_init.c @@ -0,0 +1,105 @@ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_init.c 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Initialize the Virtual Memory subsystem. + */ + +#include <sys/param.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +/* + * vm_init initializes the virtual memory system. + * This is done only by the first cpu up. + * + * The start and end address of physical memory is passed in. + */ + +void +vm_mem_init() +{ + extern vm_offset_t avail_start, avail_end; + extern vm_offset_t virtual_avail, virtual_end; + + /* + * Initializes resident memory structures. + * From here on, all physical memory is accounted for, + * and we use only virtual addresses. + */ + + vm_set_page_size(); + virtual_avail = vm_page_startup(avail_start, avail_end, virtual_avail); + /* + * Initialize other VM packages + */ + vm_object_init(virtual_end - VM_MIN_KERNEL_ADDRESS); + vm_map_startup(); + kmem_init(virtual_avail, virtual_end); + pmap_init(avail_start, avail_end); + vm_pager_init(); +} diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c new file mode 100644 index 0000000..55a0949 --- /dev/null +++ b/sys/vm/vm_kern.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Kernel memory management. + */ + +#include <sys/param.h> +#include <sys/systm.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> + +/* + * kmem_alloc_pageable: + * + * Allocate pageable memory to the kernel's address map. + * map must be "kernel_map" below. 
+ */ + +vm_offset_t kmem_alloc_pageable(map, size) + vm_map_t map; + register vm_size_t size; +{ + vm_offset_t addr; + register int result; + +#if 0 + if (map != kernel_map) + panic("kmem_alloc_pageable: not called with kernel_map"); +#endif + + size = round_page(size); + + addr = vm_map_min(map); + result = vm_map_find(map, NULL, (vm_offset_t) 0, + &addr, size, TRUE); + if (result != KERN_SUCCESS) { + return(0); + } + + return(addr); +} + +/* + * Allocate wired-down memory in the kernel's address map + * or a submap. + */ +vm_offset_t kmem_alloc(map, size) + register vm_map_t map; + register vm_size_t size; +{ + vm_offset_t addr; + register vm_offset_t offset; + extern vm_object_t kernel_object; + vm_offset_t i; + + size = round_page(size); + + /* + * Use the kernel object for wired-down kernel pages. + * Assume that no region of the kernel object is + * referenced more than once. + */ + + /* + * Locate sufficient space in the map. This will give us the + * final virtual address for the new memory, and thus will tell + * us the offset within the kernel map. + */ + vm_map_lock(map); + if (vm_map_findspace(map, 0, size, &addr)) { + vm_map_unlock(map); + return (0); + } + offset = addr - VM_MIN_KERNEL_ADDRESS; + vm_object_reference(kernel_object); + vm_map_insert(map, kernel_object, offset, addr, addr + size); + vm_map_unlock(map); + + /* + * Guarantee that there are pages already in this object + * before calling vm_map_pageable. This is to prevent the + * following scenario: + * + * 1) Threads have swapped out, so that there is a + * pager for the kernel_object. + * 2) The kmsg zone is empty, and so we are kmem_allocing + * a new page for it. + * 3) vm_map_pageable calls vm_fault; there is no page, + * but there is a pager, so we call + * pager_data_request. But the kmsg zone is empty, + * so we must kmem_alloc. + * 4) goto 1 + * 5) Even if the kmsg zone is not empty: when we get + * the data back from the pager, it will be (very + * stale) non-zero data. kmem_alloc is defined to + * return zero-filled memory. + * + * We're intentionally not activating the pages we allocate + * to prevent a race with page-out. vm_map_pageable will wire + * the pages. + */ + + vm_object_lock(kernel_object); + for (i = 0 ; i < size; i+= PAGE_SIZE) { + vm_page_t mem; + + while ((mem = vm_page_alloc(kernel_object, offset+i)) == NULL) { + vm_object_unlock(kernel_object); + VM_WAIT; + vm_object_lock(kernel_object); + } + vm_page_zero_fill(mem); + mem->flags &= ~PG_BUSY; + } + vm_object_unlock(kernel_object); + + /* + * And finally, mark the data as non-pageable. + */ + + (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); + + /* + * Try to coalesce the map + */ + + vm_map_simplify(map, addr); + + return(addr); +} + +/* + * kmem_free: + * + * Release a region of kernel virtual memory allocated + * with kmem_alloc, and return the physical pages + * associated with that region. + */ +void kmem_free(map, addr, size) + vm_map_t map; + register vm_offset_t addr; + vm_size_t size; +{ + (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); +} + +/* + * kmem_suballoc: + * + * Allocates a map to manage a subrange + * of the kernel virtual address space. 
+ * + * Arguments are as follows: + * + * parent Map to take range from + * size Size of range to find + * min, max Returned endpoints of map + * pageable Can the region be paged + */ +vm_map_t kmem_suballoc(parent, min, max, size, pageable) + register vm_map_t parent; + vm_offset_t *min, *max; + register vm_size_t size; + boolean_t pageable; +{ + register int ret; + vm_map_t result; + + size = round_page(size); + + *min = (vm_offset_t) vm_map_min(parent); + ret = vm_map_find(parent, NULL, (vm_offset_t) 0, + min, size, TRUE); + if (ret != KERN_SUCCESS) { + printf("kmem_suballoc: bad status return of %d.\n", ret); + panic("kmem_suballoc"); + } + *max = *min + size; + pmap_reference(vm_map_pmap(parent)); + result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable); + if (result == NULL) + panic("kmem_suballoc: cannot create submap"); + if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS) + panic("kmem_suballoc: unable to change range to submap"); + return(result); +} + +/* + * Allocate wired-down memory in the kernel's address map for the higher + * level kernel memory allocator (kern/kern_malloc.c). We cannot use + * kmem_alloc() because we may need to allocate memory at interrupt + * level where we cannot block (canwait == FALSE). + * + * This routine has its own private kernel submap (kmem_map) and object + * (kmem_object). This, combined with the fact that only malloc uses + * this routine, ensures that we will never block in map or object waits. + * + * Note that this still only works in a uni-processor environment and + * when called at splhigh(). + * + * We don't worry about expanding the map (adding entries) since entries + * for wired maps are statically allocated. + */ +vm_offset_t +kmem_malloc(map, size, canwait) + register vm_map_t map; + register vm_size_t size; + boolean_t canwait; +{ + register vm_offset_t offset, i; + vm_map_entry_t entry; + vm_offset_t addr; + vm_page_t m; + extern vm_object_t kmem_object; + + if (map != kmem_map && map != mb_map) + panic("kern_malloc_alloc: map != {kmem,mb}_map"); + + size = round_page(size); + addr = vm_map_min(map); + + /* + * Locate sufficient space in the map. This will give us the + * final virtual address for the new memory, and thus will tell + * us the offset within the kernel map. + */ + vm_map_lock(map); + if (vm_map_findspace(map, 0, size, &addr)) { + vm_map_unlock(map); +#if 0 + if (canwait) /* XXX should wait */ + panic("kmem_malloc: %s too small", + map == kmem_map ? "kmem_map" : "mb_map"); +#endif + if (canwait) + panic("kmem_malloc: map too small"); + return (0); + } + offset = addr - vm_map_min(kmem_map); + vm_object_reference(kmem_object); + vm_map_insert(map, kmem_object, offset, addr, addr + size); + + /* + * If we can wait, just mark the range as wired + * (will fault pages as necessary). + */ + if (canwait) { + vm_map_unlock(map); + (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, + FALSE); + vm_map_simplify(map, addr); + return(addr); + } + + /* + * If we cannot wait then we must allocate all memory up front, + * pulling it off the active queue to prevent pageout. + */ + vm_object_lock(kmem_object); + for (i = 0; i < size; i += PAGE_SIZE) { + m = vm_page_alloc(kmem_object, offset + i); + + /* + * Ran out of space, free everything up and return. + * Don't need to lock page queues here as we know + * that the pages we got aren't on any queues. 
+ */
+ if (m == NULL) {
+ while (i != 0) {
+ i -= PAGE_SIZE;
+ m = vm_page_lookup(kmem_object, offset + i);
+ vm_page_free(m);
+ }
+ vm_object_unlock(kmem_object);
+ vm_map_delete(map, addr, addr + size);
+ vm_map_unlock(map);
+ return(0);
+ }
+#if 0
+ vm_page_zero_fill(m);
+#endif
+ m->flags &= ~PG_BUSY;
+ }
+ vm_object_unlock(kmem_object);
+
+ /*
+ * Mark map entry as non-pageable.
+ * Assert: vm_map_insert() will never be able to extend the previous
+ * entry so there will be a new entry exactly corresponding to this
+ * address range and it will have wired_count == 0.
+ */
+ if (!vm_map_lookup_entry(map, addr, &entry) ||
+ entry->start != addr || entry->end != addr + size ||
+ entry->wired_count)
+ panic("kmem_malloc: entry not found or misaligned");
+ entry->wired_count++;
+
+ /*
+ * Loop thru pages, entering them in the pmap.
+ * (We cannot add them to the wired count without
+ * wrapping the vm_page_queue_lock in splimp...)
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ vm_object_lock(kmem_object);
+ m = vm_page_lookup(kmem_object, offset + i);
+ vm_object_unlock(kmem_object);
+ pmap_enter(map->pmap, addr + i, VM_PAGE_TO_PHYS(m),
+ VM_PROT_DEFAULT, TRUE);
+ }
+ vm_map_unlock(map);
+
+ vm_map_simplify(map, addr);
+ return(addr);
+}
+
+/*
+ * kmem_alloc_wait
+ *
+ * Allocates pageable memory from a sub-map of the kernel. If the submap
+ * has no room, the caller sleeps waiting for more memory in the submap.
+ *
+ */
+vm_offset_t kmem_alloc_wait(map, size)
+ vm_map_t map;
+ vm_size_t size;
+{
+ vm_offset_t addr;
+
+ size = round_page(size);
+
+ for (;;) {
+ /*
+ * To make this work for more than one map,
+ * use the map's lock to lock out sleepers/wakers.
+ */
+ vm_map_lock(map);
+ if (vm_map_findspace(map, 0, size, &addr) == 0)
+ break;
+ /* no space now; see if we can ever get space */
+ if (vm_map_max(map) - vm_map_min(map) < size) {
+ vm_map_unlock(map);
+ return (0);
+ }
+ assert_wait((int)map, TRUE);
+ vm_map_unlock(map);
+ thread_block("kmaw");
+ }
+ vm_map_insert(map, NULL, (vm_offset_t)0, addr, addr + size);
+ vm_map_unlock(map);
+ return (addr);
+}
+
+/*
+ * kmem_free_wakeup
+ *
+ * Returns memory to a submap of the kernel, and wakes up any threads
+ * waiting for memory in that map.
+ */
+void kmem_free_wakeup(map, addr, size)
+ vm_map_t map;
+ vm_offset_t addr;
+ vm_size_t size;
+{
+ vm_map_lock(map);
+ (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
+ thread_wakeup((int)map);
+ vm_map_unlock(map);
+}
+
+/*
+ * Create the kernel map; insert a mapping covering kernel text, data, bss,
+ * and all space allocated thus far (`bootstrap' data). The new map will thus
+ * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and
+ * the range between `start' and `end' as free.
+ */
+void kmem_init(start, end)
+ vm_offset_t start, end;
+{
+ register vm_map_t m;
+
+ m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE);
+ vm_map_lock(m);
+ /* N.B.: cannot use kgdb to debug, starting with this assignment ... */
+ kernel_map = m;
+ (void) vm_map_insert(m, NULL, (vm_offset_t)0,
+ VM_MIN_KERNEL_ADDRESS, start);
+ /* ... and ending with the completion of the above `insert' */
+ vm_map_unlock(m);
+} diff --git a/sys/vm/vm_kern.h b/sys/vm/vm_kern.h new file mode 100644 index 0000000..c032560 --- /dev/null +++ b/sys/vm/vm_kern.h @@ -0,0 +1,74 @@ +/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_kern.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* Kernel memory management definitions. */ + +vm_map_t buffer_map; +vm_map_t kernel_map; +vm_map_t kmem_map; +vm_map_t mb_map; +vm_map_t io_map; +vm_map_t clean_map; +vm_map_t pager_map; +vm_map_t phys_map; diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c new file mode 100644 index 0000000..ffffa96 --- /dev/null +++ b/sys/vm/vm_map.c @@ -0,0 +1,2681 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_map.c 8.3 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory mapping module. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> + +/* + * Virtual memory maps provide for the mapping, protection, + * and sharing of virtual memory objects. In addition, + * this module provides for an efficient virtual copy of + * memory from one map to another. + * + * Synchronization is required prior to most operations. 
+ *
+ * Maps consist of an ordered doubly-linked list of simple
+ * entries; a single hint is used to speed up lookups.
+ *
+ * In order to properly represent the sharing of virtual
+ * memory regions among maps, the map structure is bi-level.
+ * Top-level ("address") maps refer to regions of sharable
+ * virtual memory. These regions are implemented as
+ * ("sharing") maps, which then refer to the actual virtual
+ * memory objects. When two address maps "share" memory,
+ * their top-level maps both have references to the same
+ * sharing map. When memory is virtual-copied from one
+ * address map to another, the references in the sharing
+ * maps are actually copied -- no copying occurs at the
+ * virtual memory object level.
+ *
+ * Since portions of maps are specified by start/end addresses,
+ * which may not align with existing map entries, all
+ * routines merely "clip" entries to these start/end values.
+ * [That is, an entry is split into two, bordering at a
+ * start or end value.] Note that these clippings may not
+ * always be necessary (as the two resulting entries are then
+ * not changed); however, the clipping is done for convenience.
+ * No attempt is currently made to "glue back together" two
+ * abutting entries.
+ *
+ * As mentioned above, virtual copy operations are performed
+ * by copying VM object references from one sharing map to
+ * another, and then marking both regions as copy-on-write.
+ * It is important to note that only one writeable reference
+ * to a VM object region exists in any map -- this means that
+ * shadow object creation can be delayed until a write operation
+ * occurs.
+ */
+
+/*
+ * vm_map_startup:
+ *
+ * Initialize the vm_map module. Must be called before
+ * any other vm_map routines.
+ *
+ * Map and entry structures are allocated from the general
+ * purpose memory pool with some exceptions:
+ *
+ * - The kernel map and kmem submap are allocated statically.
+ * - Kernel map entries are allocated out of a static pool.
+ *
+ * These restrictions are necessary since malloc() uses the
+ * maps and requires map entries.
+ */
+
+vm_offset_t kentry_data;
+vm_size_t kentry_data_size;
+vm_map_entry_t kentry_free;
+vm_map_t kmap_free;
+
+int kentry_count;
+vm_map_t kmap_free;
+static vm_offset_t mapvm=0;
+static int mapvmpgcnt=0;
+
+static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t));
+static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t));
+
+void vm_map_startup()
+{
+ register int i;
+ register vm_map_entry_t mep;
+ vm_map_t mp;
+
+ /*
+ * Static map structures for allocation before initialization of
+ * kernel map or kmem map. vm_map_create knows how to deal with them.
+ */
+ kmap_free = mp = (vm_map_t) kentry_data;
+ i = MAX_KMAP;
+ while (--i > 0) {
+ mp->header.next = (vm_map_entry_t) (mp + 1);
+ mp++;
+ }
+ mp++->header.next = NULL;
+
+ /*
+ * Form a free list of statically allocated kernel map entries
+ * with the rest.
+ */
+ kentry_free = mep = (vm_map_entry_t) mp;
+ i = (kentry_data_size - MAX_KMAP * sizeof *mp) / sizeof *mep;
+ while (--i > 0) {
+ mep->next = mep + 1;
+ mep++;
+ }
+ mep->next = NULL;
+}
+
+/*
+ * Allocate a vmspace structure, including a vm_map and pmap,
+ * and initialize those structures. The refcnt is set to 1.
+ * The remaining fields must be initialized by the caller. 
+ */ +struct vmspace * +vmspace_alloc(min, max, pageable) + vm_offset_t min, max; + int pageable; +{ + register struct vmspace *vm; + + MALLOC(vm, struct vmspace *, sizeof(struct vmspace), M_VMMAP, M_WAITOK); + bzero(vm, (caddr_t) &vm->vm_startcopy - (caddr_t) vm); + vm_map_init(&vm->vm_map, min, max, pageable); + pmap_pinit(&vm->vm_pmap); + vm->vm_map.pmap = &vm->vm_pmap; /* XXX */ + vm->vm_refcnt = 1; + return (vm); +} + +void +vmspace_free(vm) + register struct vmspace *vm; +{ + + if (--vm->vm_refcnt == 0) { + /* + * Lock the map, to wait out all other references to it. + * Delete all of the mappings and pages they hold, + * then call the pmap module to reclaim anything left. + */ + vm_map_lock(&vm->vm_map); + (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, + vm->vm_map.max_offset); + pmap_release(&vm->vm_pmap); + FREE(vm, M_VMMAP); + } +} + +/* + * vm_map_create: + * + * Creates and returns a new empty VM map with + * the given physical map structure, and having + * the given lower and upper address bounds. + */ +vm_map_t vm_map_create(pmap, min, max, pageable) + pmap_t pmap; + vm_offset_t min, max; + boolean_t pageable; +{ + register vm_map_t result; + extern vm_map_t kmem_map; + + if (kmem_map == NULL) { + result = kmap_free; + kmap_free = (vm_map_t) result->header.next; + if (result == NULL) + panic("vm_map_create: out of maps"); + } else + MALLOC(result, vm_map_t, sizeof(struct vm_map), + M_VMMAP, M_WAITOK); + + vm_map_init(result, min, max, pageable); + result->pmap = pmap; + return(result); +} + +/* + * Initialize an existing vm_map structure + * such as that in the vmspace structure. + * The pmap is set elsewhere. + */ +void +vm_map_init(map, min, max, pageable) + register struct vm_map *map; + vm_offset_t min, max; + boolean_t pageable; +{ + map->header.next = map->header.prev = &map->header; + map->nentries = 0; + map->size = 0; + map->ref_count = 1; + map->is_main_map = TRUE; + map->min_offset = min; + map->max_offset = max; + map->entries_pageable = pageable; + map->first_free = &map->header; + map->hint = &map->header; + map->timestamp = 0; + lock_init(&map->lock, TRUE); + simple_lock_init(&map->ref_lock); + simple_lock_init(&map->hint_lock); +} + +/* + * vm_map_entry_create: [ internal use only ] + * + * Allocates a VM map entry for insertion. + * No entry fields are filled in. This routine is + */ +static struct vm_map_entry *mappool; +static int mappoolcnt; +void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); + +vm_map_entry_t +vm_map_entry_create(map) + vm_map_t map; +{ + vm_map_entry_t entry; + int s; + int i; +#define KENTRY_LOW_WATER 64 +#define MAPENTRY_LOW_WATER 64 + + /* + * This is a *very* nasty (and sort of incomplete) hack!!!! 
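/*
 * A minimal sketch of the list shape vm_map_init() above sets up: an empty
 * map is a circular doubly-linked list whose own header is the sentinel,
 * so traversals terminate by comparing against &map->header rather than
 * NULL, and the header's start/end double as min/max offsets.  Not kernel
 * code; node, ring, ring_init, and ring_empty are hypothetical names.
 */
#include <assert.h>
#include <stdio.h>

struct node {
	struct node *prev, *next;
	unsigned long start, end;
};

struct ring {
	struct node header;	/* sentinel; start/end play min/max_offset */
	int nentries;
};

static void
ring_init(struct ring *r, unsigned long min, unsigned long max)
{
	r->header.next = r->header.prev = &r->header;
	r->header.start = min;
	r->header.end = max;
	r->nentries = 0;
}

static int
ring_empty(struct ring *r)
{
	return (r->header.next == &r->header);
}

int
main(void)
{
	struct ring r;

	ring_init(&r, 0x1000, 0xffff000);
	assert(ring_empty(&r));
	printf("empty map: header.next == header.prev == &header\n");
	return (0);
}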
+ */ + if (kentry_count < KENTRY_LOW_WATER) { + if (mapvmpgcnt && mapvm) { + vm_page_t m; + if (m = vm_page_alloc(kmem_object, mapvm-vm_map_min(kmem_map))) { + int newentries; + newentries = (NBPG/sizeof (struct vm_map_entry)); + vm_page_wire(m); + m->flags &= ~PG_BUSY; + pmap_enter(vm_map_pmap(kmem_map), mapvm, + VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT, 1); + + entry = (vm_map_entry_t) mapvm; + mapvm += NBPG; + --mapvmpgcnt; + + for (i = 0; i < newentries; i++) { + vm_map_entry_dispose(kernel_map, entry); + entry++; + } + } + } + } + + if (map == kernel_map || map == kmem_map || map == pager_map) { + + if (entry = kentry_free) { + kentry_free = entry->next; + --kentry_count; + return entry; + } + + if (entry = mappool) { + mappool = entry->next; + --mappoolcnt; + return entry; + } + + } else { + if (entry = mappool) { + mappool = entry->next; + --mappoolcnt; + return entry; + } + + MALLOC(entry, vm_map_entry_t, sizeof(struct vm_map_entry), + M_VMMAPENT, M_WAITOK); + } +dopanic: + if (entry == NULL) + panic("vm_map_entry_create: out of map entries"); + + return(entry); +} + +/* + * vm_map_entry_dispose: [ internal use only ] + * + * Inverse of vm_map_entry_create. + */ +void +vm_map_entry_dispose(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ + extern vm_map_t kernel_map, kmem_map, pager_map; + int s; + + if (map == kernel_map || map == kmem_map || map == pager_map || + kentry_count < KENTRY_LOW_WATER) { + entry->next = kentry_free; + kentry_free = entry; + ++kentry_count; + } else { + if (mappoolcnt < MAPENTRY_LOW_WATER) { + entry->next = mappool; + mappool = entry; + ++mappoolcnt; + return; + } + + FREE(entry, M_VMMAPENT); + } +} + +/* + * vm_map_entry_{un,}link: + * + * Insert/remove entries from maps. + */ +#define vm_map_entry_link(map, after_where, entry) \ + { \ + (map)->nentries++; \ + (entry)->prev = (after_where); \ + (entry)->next = (after_where)->next; \ + (entry)->prev->next = (entry); \ + (entry)->next->prev = (entry); \ + } +#define vm_map_entry_unlink(map, entry) \ + { \ + (map)->nentries--; \ + (entry)->next->prev = (entry)->prev; \ + (entry)->prev->next = (entry)->next; \ + } + +/* + * vm_map_reference: + * + * Creates another valid reference to the given map. + * + */ +void vm_map_reference(map) + register vm_map_t map; +{ + if (map == NULL) + return; + + simple_lock(&map->ref_lock); + map->ref_count++; + simple_unlock(&map->ref_lock); +} + +/* + * vm_map_deallocate: + * + * Removes a reference from the specified map, + * destroying it if no references remain. + * The map should not be locked. + */ +void vm_map_deallocate(map) + register vm_map_t map; +{ + register int c; + + if (map == NULL) + return; + + simple_lock(&map->ref_lock); + c = --map->ref_count; + simple_unlock(&map->ref_lock); + + if (c > 0) { + return; + } + + /* + * Lock the map, to wait out all other references + * to it. + */ + + vm_map_lock(map); + + (void) vm_map_delete(map, map->min_offset, map->max_offset); + + pmap_destroy(map->pmap); + + FREE(map, M_VMMAP); +} + +/* + * vm_map_insert: + * + * Inserts the given whole VM object into the target + * map at the specified address range. The object's + * size should match that of the address range. + * + * Requires that the map be locked, and leaves it so. 
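/*
 * A minimal sketch of the allocation pattern in vm_map_entry_create() and
 * vm_map_entry_dispose() above: keep a small private free pool, fall back
 * to the general allocator, and return freed entries to the pool until it
 * reaches a cap.  Not kernel code; entry_pool, pool_count, POOL_CAP,
 * entry_alloc, and entry_free are hypothetical stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	unsigned long start, end;
};

#define POOL_CAP 64			/* in the spirit of MAPENTRY_LOW_WATER */

static struct entry *entry_pool;	/* in the spirit of mappool */
static int pool_count;			/* in the spirit of mappoolcnt */

static struct entry *
entry_alloc(void)
{
	struct entry *e;

	if ((e = entry_pool) != NULL) {		/* fast path: reuse */
		entry_pool = e->next;
		pool_count--;
		return (e);
	}
	if ((e = malloc(sizeof(*e))) == NULL)	/* slow path: allocator */
		abort();			/* the kernel would panic */
	return (e);
}

static void
entry_free(struct entry *e)
{
	if (pool_count < POOL_CAP) {		/* keep it for reuse */
		e->next = entry_pool;
		entry_pool = e;
		pool_count++;
		return;
	}
	free(e);				/* pool is full: give it back */
}

int
main(void)
{
	struct entry *e = entry_alloc();

	entry_free(e);
	printf("pool now holds %d cached entries\n", pool_count);
	return (0);
}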
+ */ +int +vm_map_insert(map, object, offset, start, end) + vm_map_t map; + vm_object_t object; + vm_offset_t offset; + vm_offset_t start; + vm_offset_t end; +{ + register vm_map_entry_t new_entry; + register vm_map_entry_t prev_entry; + vm_map_entry_t temp_entry; + + /* + * Check that the start and end points are not bogus. + */ + + if ((start < map->min_offset) || (end > map->max_offset) || + (start >= end)) + return(KERN_INVALID_ADDRESS); + + /* + * Find the entry prior to the proposed + * starting address; if it's part of an + * existing entry, this range is bogus. + */ + + if (vm_map_lookup_entry(map, start, &temp_entry)) + return(KERN_NO_SPACE); + + prev_entry = temp_entry; + + /* + * Assert that the next entry doesn't overlap the + * end point. + */ + + if ((prev_entry->next != &map->header) && + (prev_entry->next->start < end)) + return(KERN_NO_SPACE); + + /* + * See if we can avoid creating a new entry by + * extending one of our neighbors. + */ + + if (object == NULL) { + if ((prev_entry != &map->header) && + (prev_entry->end == start) && + (map->is_main_map) && + (prev_entry->is_a_map == FALSE) && + (prev_entry->is_sub_map == FALSE) && + (prev_entry->inheritance == VM_INHERIT_DEFAULT) && + (prev_entry->protection == VM_PROT_DEFAULT) && + (prev_entry->max_protection == VM_PROT_DEFAULT) && + (prev_entry->wired_count == 0)) { + + if (vm_object_coalesce(prev_entry->object.vm_object, + NULL, + prev_entry->offset, + (vm_offset_t) 0, + (vm_size_t)(prev_entry->end + - prev_entry->start), + (vm_size_t)(end - prev_entry->end))) { + /* + * Coalesced the two objects - can extend + * the previous map entry to include the + * new range. + */ + map->size += (end - prev_entry->end); + prev_entry->end = end; + return(KERN_SUCCESS); + } + } + } + + /* + * Create a new entry + */ + + new_entry = vm_map_entry_create(map); + new_entry->start = start; + new_entry->end = end; + + new_entry->is_a_map = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->object.vm_object = object; + new_entry->offset = offset; + + new_entry->copy_on_write = FALSE; + new_entry->needs_copy = FALSE; + + if (map->is_main_map) { + new_entry->inheritance = VM_INHERIT_DEFAULT; + new_entry->protection = VM_PROT_DEFAULT; + new_entry->max_protection = VM_PROT_DEFAULT; + new_entry->wired_count = 0; + } + + /* + * Insert the new entry into the list + */ + + vm_map_entry_link(map, prev_entry, new_entry); + map->size += new_entry->end - new_entry->start; + + /* + * Update the free space hint + */ + + if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) + map->first_free = new_entry; + + return(KERN_SUCCESS); +} + +/* + * SAVE_HINT: + * + * Saves the specified entry as the hint for + * future lookups. Performs necessary interlocks. + */ +#define SAVE_HINT(map,value) \ + simple_lock(&(map)->hint_lock); \ + (map)->hint = (value); \ + simple_unlock(&(map)->hint_lock); + +/* + * vm_map_lookup_entry: [ internal use only ] + * + * Finds the map entry containing (or + * immediately preceding) the specified address + * in the given map; the entry is returned + * in the "entry" parameter. The boolean + * result indicates whether the address is + * actually contained in the map. + */ +boolean_t vm_map_lookup_entry(map, address, entry) + register vm_map_t map; + register vm_offset_t address; + vm_map_entry_t *entry; /* OUT */ +{ + register vm_map_entry_t cur; + register vm_map_entry_t last; + + /* + * Start looking either from the head of the + * list, or from the hint. 
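/*
 * A minimal sketch of the core of vm_map_insert() above: check the bounds,
 * find the predecessor in the sorted list, reject any overlap with an
 * existing entry, and link a new one in.  Not kernel code; the list here
 * is a simplified singly-linked one without the header sentinel, and the
 * names range, rlist, and range_insert are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct range {
	struct range *next;
	unsigned long start, end;	/* [start, end) */
};

struct rlist {
	struct range *head;		/* sorted by start */
	unsigned long min, max;
};

/* Returns 0 on success, -1 for bad bounds or an overlap. */
static int
range_insert(struct rlist *l, unsigned long start, unsigned long end)
{
	struct range **prevp = &l->head, *next, *r;

	if (start < l->min || end > l->max || start >= end)
		return (-1);			/* cf. KERN_INVALID_ADDRESS */
	while ((next = *prevp) != NULL && next->end <= start)
		prevp = &next->next;		/* find the insertion point */
	if (next != NULL && next->start < end)
		return (-1);			/* cf. KERN_NO_SPACE: overlap */
	if ((r = malloc(sizeof(*r))) == NULL)
		abort();
	r->start = start;
	r->end = end;
	r->next = next;
	*prevp = r;
	return (0);
}

int
main(void)
{
	struct rlist l = { NULL, 0x0000, 0x10000 };

	printf("%d\n", range_insert(&l, 0x1000, 0x3000));	/* 0: ok */
	printf("%d\n", range_insert(&l, 0x2000, 0x4000));	/* -1: overlap */
	printf("%d\n", range_insert(&l, 0x3000, 0x4000));	/* 0: abuts, ok */
	return (0);
}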
+ */ + + simple_lock(&map->hint_lock); + cur = map->hint; + simple_unlock(&map->hint_lock); + + if (cur == &map->header) + cur = cur->next; + + if (address >= cur->start) { + /* + * Go from hint to end of list. + * + * But first, make a quick check to see if + * we are already looking at the entry we + * want (which is usually the case). + * Note also that we don't need to save the hint + * here... it is the same hint (unless we are + * at the header, in which case the hint didn't + * buy us anything anyway). + */ + last = &map->header; + if ((cur != last) && (cur->end > address)) { + *entry = cur; + return(TRUE); + } + } + else { + /* + * Go from start to hint, *inclusively* + */ + last = cur->next; + cur = map->header.next; + } + + /* + * Search linearly + */ + + while (cur != last) { + if (cur->end > address) { + if (address >= cur->start) { + /* + * Save this lookup for future + * hints, and return + */ + + *entry = cur; + SAVE_HINT(map, cur); + return(TRUE); + } + break; + } + cur = cur->next; + } + *entry = cur->prev; + SAVE_HINT(map, *entry); + return(FALSE); +} + +/* + * Find sufficient space for `length' bytes in the given map, starting at + * `start'. The map must be locked. Returns 0 on success, 1 on no space. + */ +int +vm_map_findspace(map, start, length, addr) + register vm_map_t map; + register vm_offset_t start; + vm_size_t length; + vm_offset_t *addr; +{ + register vm_map_entry_t entry, next; + register vm_offset_t end; + + if (start < map->min_offset) + start = map->min_offset; + if (start > map->max_offset) + return (1); + + /* + * Look for the first possible address; if there's already + * something at this address, we have to start after it. + */ + if (start == map->min_offset) { + if ((entry = map->first_free) != &map->header) + start = entry->end; + } else { + vm_map_entry_t tmp; + if (vm_map_lookup_entry(map, start, &tmp)) + start = tmp->end; + entry = tmp; + } + + /* + * Look through the rest of the map, trying to fit a new region in + * the gap between existing regions, or after the very last region. + */ + for (;; start = (entry = next)->end) { + /* + * Find the end of the proposed new region. Be sure we didn't + * go beyond the end of the map, or wrap around the address; + * if so, we lose. Otherwise, if this is the last entry, or + * if the proposed new region fits before the next entry, we + * win. + */ + end = start + length; + if (end > map->max_offset || end < start) + return (1); + next = entry->next; + if (next == &map->header || next->start >= end) + break; + } + SAVE_HINT(map, entry); + *addr = start; + return (0); +} + +/* + * vm_map_find finds an unallocated region in the target address + * map with the given length. The search is defined to be + * first-fit from the specified address; the region found is + * returned in the same parameter. 
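/*
 * A minimal sketch of what vm_map_findspace() above does: scan the sorted
 * allocations for the first gap of at least `length' bytes at or after
 * `start', guarding against running off the end of the map and against
 * address wrap-around.  Not kernel code; it works over a sorted array of
 * used spans instead of the map's list, and all names are hypothetical.
 */
#include <stdio.h>

struct span { unsigned long start, end; };	/* [start, end), sorted */

/* Returns 0 and sets *addr on success, 1 if there is no room. */
static int
find_space(const struct span *used, int n, unsigned long min, unsigned long max,
    unsigned long start, unsigned long length, unsigned long *addr)
{
	unsigned long pos = start < min ? min : start;
	int i;

	if (pos > max)
		return (1);
	for (i = 0; i < n; i++) {
		if (used[i].end <= pos)
			continue;		/* entirely below our cursor */
		if (used[i].start >= pos + length && pos + length >= pos)
			break;			/* the gap before it fits */
		pos = used[i].end;		/* otherwise try after it */
	}
	if (pos + length > max || pos + length < pos)
		return (1);			/* past the end, or wrapped */
	*addr = pos;
	return (0);
}

int
main(void)
{
	struct span used[] = { { 0x1000, 0x3000 }, { 0x4000, 0x5000 } };
	unsigned long a;

	if (find_space(used, 2, 0x1000, 0x10000, 0x1000, 0x2000, &a) == 0)
		printf("first fit at 0x%lx\n", a);	/* prints 0x5000 */
	return (0);
}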
+ * + */ +int +vm_map_find(map, object, offset, addr, length, find_space) + vm_map_t map; + vm_object_t object; + vm_offset_t offset; + vm_offset_t *addr; /* IN/OUT */ + vm_size_t length; + boolean_t find_space; +{ + register vm_offset_t start; + int result; + + start = *addr; + vm_map_lock(map); + if (find_space) { + if (vm_map_findspace(map, start, length, addr)) { + vm_map_unlock(map); + return (KERN_NO_SPACE); + } + start = *addr; + } + result = vm_map_insert(map, object, offset, start, start + length); + vm_map_unlock(map); + return (result); +} + +/* + * vm_map_simplify_entry: [ internal use only ] + * + * Simplify the given map entry by: + * removing extra sharing maps + * [XXX maybe later] merging with a neighbor + */ +void vm_map_simplify_entry(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ +#ifdef lint + map++; +#endif + + /* + * If this entry corresponds to a sharing map, then + * see if we can remove the level of indirection. + * If it's not a sharing map, then it points to + * a VM object, so see if we can merge with either + * of our neighbors. + */ + + if (entry->is_sub_map) + return; + if (entry->is_a_map) { +#if 0 + vm_map_t my_share_map; + int count; + + my_share_map = entry->object.share_map; + simple_lock(&my_share_map->ref_lock); + count = my_share_map->ref_count; + simple_unlock(&my_share_map->ref_lock); + + if (count == 1) { + /* Can move the region from + * entry->start to entry->end (+ entry->offset) + * in my_share_map into place of entry. + * Later. + */ + } +#endif + } + else { + /* + * Try to merge with our neighbors. + * + * Conditions for merge are: + * + * 1. entries are adjacent. + * 2. both entries point to objects + * with null pagers. + * + * If a merge is possible, we replace the two + * entries with a single entry, then merge + * the two objects into a single object. + * + * Now, all that is left to do is write the + * code! + */ + } +} + +/* + * vm_map_clip_start: [ internal use only ] + * + * Asserts that the given entry begins at or after + * the specified address; if necessary, + * it splits the entry into two. + */ +#define vm_map_clip_start(map, entry, startaddr) \ +{ \ + if (startaddr > entry->start) \ + _vm_map_clip_start(map, entry, startaddr); \ +} + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +static void _vm_map_clip_start(map, entry, start) + register vm_map_t map; + register vm_map_entry_t entry; + register vm_offset_t start; +{ + register vm_map_entry_t new_entry; + + /* + * See if we can simplify this entry first + */ + + /* vm_map_simplify_entry(map, entry); */ + + /* + * Split off the front portion -- + * note that we must insert the new + * entry BEFORE this one, so that + * this entry has the specified starting + * address. + */ + + new_entry = vm_map_entry_create(map); + *new_entry = *entry; + + new_entry->end = start; + entry->offset += (start - entry->start); + entry->start = start; + + vm_map_entry_link(map, entry->prev, new_entry); + + if (entry->is_a_map || entry->is_sub_map) + vm_map_reference(new_entry->object.share_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * vm_map_clip_end: [ internal use only ] + * + * Asserts that the given entry ends at or before + * the specified address; if necessary, + * it splits the entry into two. 
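/*
 * A minimal sketch of the arithmetic in _vm_map_clip_start() above: split
 * one entry into two at an address, where the front piece keeps the
 * original object offset and the back piece's offset advances by the
 * amount clipped off.  Not kernel code; the list manipulation is omitted
 * and ent, clip_front, front, and back are hypothetical names.
 */
#include <assert.h>
#include <stdio.h>

struct ent {
	unsigned long start, end;	/* [start, end) of the mapping */
	unsigned long offset;		/* offset of start within the object */
};

static void
clip_front(const struct ent *e, unsigned long addr,
    struct ent *front, struct ent *back)
{
	assert(addr > e->start && addr < e->end);

	*front = *e;
	front->end = addr;			/* front keeps the original offset */

	*back = *e;
	back->start = addr;
	back->offset += addr - e->start;	/* skip what the front now covers */
}

int
main(void)
{
	struct ent e = { 0x2000, 0x6000, 0x0 }, front, back;

	clip_front(&e, 0x3000, &front, &back);
	printf("front [%lx,%lx) off %lx, back [%lx,%lx) off %lx\n",
	    front.start, front.end, front.offset,
	    back.start, back.end, back.offset);
	return (0);
}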
+ */ + +#define vm_map_clip_end(map, entry, endaddr) \ +{ \ + if (endaddr < entry->end) \ + _vm_map_clip_end(map, entry, endaddr); \ +} + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +static void _vm_map_clip_end(map, entry, end) + register vm_map_t map; + register vm_map_entry_t entry; + register vm_offset_t end; +{ + register vm_map_entry_t new_entry; + + /* + * Create a new entry and insert it + * AFTER the specified entry + */ + + new_entry = vm_map_entry_create(map); + *new_entry = *entry; + + new_entry->start = entry->end = end; + new_entry->offset += (end - entry->start); + + vm_map_entry_link(map, entry, new_entry); + + if (entry->is_a_map || entry->is_sub_map) + vm_map_reference(new_entry->object.share_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * VM_MAP_RANGE_CHECK: [ internal use only ] + * + * Asserts that the starting and ending region + * addresses fall within the valid range of the map. + */ +#define VM_MAP_RANGE_CHECK(map, start, end) \ + { \ + if (start < vm_map_min(map)) \ + start = vm_map_min(map); \ + if (end > vm_map_max(map)) \ + end = vm_map_max(map); \ + if (start > end) \ + start = end; \ + } + +/* + * vm_map_submap: [ kernel use only ] + * + * Mark the given range as handled by a subordinate map. + * + * This range must have been created with vm_map_find, + * and no other operations may have been performed on this + * range prior to calling vm_map_submap. + * + * Only a limited number of operations can be performed + * within this rage after calling vm_map_submap: + * vm_fault + * [Don't try vm_map_copy!] + * + * To remove a submapping, one must first remove the + * range from the superior map, and then destroy the + * submap (if desired). [Better yet, don't try it.] + */ +int +vm_map_submap(map, start, end, submap) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + vm_map_t submap; +{ + vm_map_entry_t entry; + register int result = KERN_INVALID_ARGUMENT; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->next; + + vm_map_clip_end(map, entry, end); + + if ((entry->start == start) && (entry->end == end) && + (!entry->is_a_map) && + (entry->object.vm_object == NULL) && + (!entry->copy_on_write)) { + entry->is_a_map = FALSE; + entry->is_sub_map = TRUE; + vm_map_reference(entry->object.sub_map = submap); + result = KERN_SUCCESS; + } + vm_map_unlock(map); + + return(result); +} + +/* + * vm_map_protect: + * + * Sets the protection of the specified address + * region in the target map. If "set_max" is + * specified, the maximum protection is to be set; + * otherwise, only the current protection is affected. + */ +int +vm_map_protect(map, start, end, new_prot, set_max) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_prot_t new_prot; + register boolean_t set_max; +{ + register vm_map_entry_t current; + vm_map_entry_t entry; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->next; + + /* + * Make a first pass to check for protection + * violations. 
+ */
+
+ current = entry;
+ while ((current != &map->header) && (current->start < end)) {
+ if (current->is_sub_map) {
+ vm_map_unlock(map);
+ return(KERN_INVALID_ARGUMENT);
+ }
+ if ((new_prot & current->max_protection) != new_prot) {
+ vm_map_unlock(map);
+ return(KERN_PROTECTION_FAILURE);
+ }
+
+ current = current->next;
+ }
+
+ /*
+ * Go back and fix up protections.
+ * [Note that clipping is not necessary the second time.]
+ */
+
+ current = entry;
+
+ while ((current != &map->header) && (current->start < end)) {
+ vm_prot_t old_prot;
+
+ vm_map_clip_end(map, current, end);
+
+ old_prot = current->protection;
+ if (set_max)
+ current->protection =
+ (current->max_protection = new_prot) &
+ old_prot;
+ else
+ current->protection = new_prot;
+
+ /*
+ * Update physical map if necessary.
+ * Worry about copy-on-write here -- CHECK THIS XXX
+ */
+
+ if (current->protection != old_prot) {
+
+#define MASK(entry) ((entry)->copy_on_write ? ~VM_PROT_WRITE : \
+ VM_PROT_ALL)
+#define max(a,b) ((a) > (b) ? (a) : (b))
+
+ if (current->is_a_map) {
+ vm_map_entry_t share_entry;
+ vm_offset_t share_end;
+
+ vm_map_lock(current->object.share_map);
+ (void) vm_map_lookup_entry(
+ current->object.share_map,
+ current->offset,
+ &share_entry);
+ share_end = current->offset +
+ (current->end - current->start);
+ while ((share_entry !=
+ &current->object.share_map->header) &&
+ (share_entry->start < share_end)) {
+
+ pmap_protect(map->pmap,
+ (max(share_entry->start,
+ current->offset) -
+ current->offset +
+ current->start),
+ min(share_entry->end,
+ share_end) -
+ current->offset +
+ current->start,
+ current->protection &
+ MASK(share_entry));
+
+ share_entry = share_entry->next;
+ }
+ vm_map_unlock(current->object.share_map);
+ }
+ else
+ pmap_protect(map->pmap, current->start,
+ current->end,
+ current->protection & MASK(entry));
+#undef max
+#undef MASK
+ }
+ current = current->next;
+ }
+
+ vm_map_unlock(map);
+ return(KERN_SUCCESS);
+}
+
+/*
+ * vm_map_inherit:
+ *
+ * Sets the inheritance of the specified address
+ * range in the target map. Inheritance
+ * affects how the map will be shared with
+ * child maps at the time of vm_map_fork.
+ */
+int
+vm_map_inherit(map, start, end, new_inheritance)
+ register vm_map_t map;
+ register vm_offset_t start;
+ register vm_offset_t end;
+ register vm_inherit_t new_inheritance;
+{
+ register vm_map_entry_t entry;
+ vm_map_entry_t temp_entry;
+
+ switch (new_inheritance) {
+ case VM_INHERIT_NONE:
+ case VM_INHERIT_COPY:
+ case VM_INHERIT_SHARE:
+ break;
+ default:
+ return(KERN_INVALID_ARGUMENT);
+ }
+
+ vm_map_lock(map);
+
+ VM_MAP_RANGE_CHECK(map, start, end);
+
+ if (vm_map_lookup_entry(map, start, &temp_entry)) {
+ entry = temp_entry;
+ vm_map_clip_start(map, entry, start);
+ }
+ else
+ entry = temp_entry->next;
+
+ while ((entry != &map->header) && (entry->start < end)) {
+ vm_map_clip_end(map, entry, end);
+
+ entry->inheritance = new_inheritance;
+
+ entry = entry->next;
+ }
+
+ vm_map_unlock(map);
+ return(KERN_SUCCESS);
+}
+
+/*
+ * vm_map_pageable:
+ *
+ * Sets the pageability of the specified address
+ * range in the target map. Regions specified
+ * as not pageable require locked-down physical
+ * memory and physical page maps.
+ *
+ * The map must not be locked, but a reference
+ * must remain to the map throughout the call. 
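/*
 * A minimal sketch of what the MASK() macro in vm_map_protect() above
 * computes: strip VM_PROT_WRITE from the protection handed to
 * pmap_protect() when an entry is marked copy-on-write, so the hardware
 * never grants write access to pages that still have to be copied on the
 * first write.  Not kernel code; the constants and effective_prot() below
 * are stand-ins, not the kernel's definitions.
 */
#include <stdio.h>

#define PROT_READ_	0x1
#define PROT_WRITE_	0x2
#define PROT_EXEC_	0x4
#define PROT_ALL_	(PROT_READ_ | PROT_WRITE_ | PROT_EXEC_)

static int
effective_prot(int prot, int copy_on_write)
{
	/* Same shape as: prot & (copy_on_write ? ~VM_PROT_WRITE : VM_PROT_ALL) */
	return (prot & (copy_on_write ? ~PROT_WRITE_ : PROT_ALL_));
}

int
main(void)
{
	printf("rw, not COW -> %x\n",
	    effective_prot(PROT_READ_ | PROT_WRITE_, 0));	/* 0x3 */
	printf("rw, COW     -> %x\n",
	    effective_prot(PROT_READ_ | PROT_WRITE_, 1));	/* 0x1 */
	return (0);
}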
+ */ +int +vm_map_pageable(map, start, end, new_pageable) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register boolean_t new_pageable; +{ + register vm_map_entry_t entry; + vm_map_entry_t start_entry; + register vm_offset_t failed = 0; + int rv; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + /* + * Only one pageability change may take place at one + * time, since vm_fault assumes it will be called + * only once for each wiring/unwiring. Therefore, we + * have to make sure we're actually changing the pageability + * for the entire region. We do so before making any changes. + */ + + if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { + vm_map_unlock(map); + return(KERN_INVALID_ADDRESS); + } + entry = start_entry; + + /* + * Actions are rather different for wiring and unwiring, + * so we have two separate cases. + */ + + if (new_pageable) { + + vm_map_clip_start(map, entry, start); + + /* + * Unwiring. First ensure that the range to be + * unwired is really wired down and that there + * are no holes. + */ + while ((entry != &map->header) && (entry->start < end)) { + + if (entry->wired_count == 0 || + (entry->end < end && + (entry->next == &map->header || + entry->next->start > entry->end))) { + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + entry = entry->next; + } + + /* + * Now decrement the wiring count for each region. + * If a region becomes completely unwired, + * unwire its physical pages and mappings. + */ + lock_set_recursive(&map->lock); + + entry = start_entry; + while ((entry != &map->header) && (entry->start < end)) { + vm_map_clip_end(map, entry, end); + + entry->wired_count--; + if (entry->wired_count == 0) + vm_fault_unwire(map, entry->start, entry->end); + + entry = entry->next; + } + lock_clear_recursive(&map->lock); + } + + else { + /* + * Wiring. We must do this in two passes: + * + * 1. Holding the write lock, we create any shadow + * or zero-fill objects that need to be created. + * Then we clip each map entry to the region to be + * wired and increment its wiring count. We + * create objects before clipping the map entries + * to avoid object proliferation. + * + * 2. We downgrade to a read lock, and call + * vm_fault_wire to fault in the pages for any + * newly wired area (wired_count is 1). + * + * Downgrading to a read lock for vm_fault_wire avoids + * a possible deadlock with another thread that may have + * faulted on one of the pages to be wired (it would mark + * the page busy, blocking us, then in turn block on the + * map lock that we hold). Because of problems in the + * recursive lock package, we cannot upgrade to a write + * lock in vm_map_lookup. Thus, any actions that require + * the write lock must be done beforehand. Because we + * keep the read lock on the map, the copy-on-write status + * of the entries we modify here cannot change. + */ + + /* + * Pass 1. + */ + while ((entry != &map->header) && (entry->start < end)) { + if (entry->wired_count == 0) { + + /* + * Perform actions of vm_map_lookup that need + * the write lock on the map: create a shadow + * object for a copy-on-write region, or an + * object for a zero-fill region. + * + * We don't have to do this for entries that + * point to sharing maps, because we won't hold + * the lock on the sharing map. 
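/*
 * A minimal sketch of the wiring discipline in vm_map_pageable() above:
 * wiring is a per-entry counter, pages are faulted in only on the 0 -> 1
 * transition and released only on the 1 -> 0 transition, so nested wire
 * requests stay cheap.  Not kernel code (the real code splits the count
 * update and the faulting into two passes); region, wire_region,
 * unwire_region, and the fault_* stubs are hypothetical.
 */
#include <stdio.h>

struct region {
	const char *name;
	int wired_count;
};

static void fault_wire(struct region *r)   { printf("fault in %s\n", r->name); }
static void fault_unwire(struct region *r) { printf("release %s\n", r->name); }

static void
wire_region(struct region *r)
{
	if (r->wired_count++ == 0)	/* only the first wiring faults pages in */
		fault_wire(r);
}

static void
unwire_region(struct region *r)
{
	if (--r->wired_count == 0)	/* only the last unwiring releases them */
		fault_unwire(r);
}

int
main(void)
{
	struct region r = { "stack", 0 };

	wire_region(&r);	/* faults pages in */
	wire_region(&r);	/* already wired: just counts */
	unwire_region(&r);	/* still wired by the first request */
	unwire_region(&r);	/* releases the pages */
	return (0);
}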
+ */ + if (!entry->is_a_map) { + if (entry->needs_copy && + ((entry->protection & VM_PROT_WRITE) != 0)) { + + vm_object_shadow(&entry->object.vm_object, + &entry->offset, + (vm_size_t)(entry->end + - entry->start)); + entry->needs_copy = FALSE; + } + else if (entry->object.vm_object == NULL) { + entry->object.vm_object = + vm_object_allocate((vm_size_t)(entry->end + - entry->start)); + entry->offset = (vm_offset_t)0; + } + } + } + vm_map_clip_start(map, entry, start); + vm_map_clip_end(map, entry, end); + entry->wired_count++; + + /* + * Check for holes + */ + if (entry->end < end && + (entry->next == &map->header || + entry->next->start > entry->end)) { + /* + * Found one. Object creation actions + * do not need to be undone, but the + * wired counts need to be restored. + */ + while (entry != &map->header && entry->end > start) { + entry->wired_count--; + entry = entry->prev; + } + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + entry = entry->next; + } + + /* + * Pass 2. + */ + + /* + * HACK HACK HACK HACK + * + * If we are wiring in the kernel map or a submap of it, + * unlock the map to avoid deadlocks. We trust that the + * kernel threads are well-behaved, and therefore will + * not do anything destructive to this region of the map + * while we have it unlocked. We cannot trust user threads + * to do the same. + * + * HACK HACK HACK HACK + */ + if (vm_map_pmap(map) == kernel_pmap) { + vm_map_unlock(map); /* trust me ... */ + } + else { + lock_set_recursive(&map->lock); + lock_write_to_read(&map->lock); + } + + rv = 0; + entry = start_entry; + while (entry != &map->header && entry->start < end) { + /* + * If vm_fault_wire fails for any page we need to + * undo what has been done. We decrement the wiring + * count for those pages which have not yet been + * wired (now) and unwire those that have (later). + * + * XXX this violates the locking protocol on the map, + * needs to be fixed. + */ + if (rv) + entry->wired_count--; + else if (entry->wired_count == 1) { + rv = vm_fault_wire(map, entry->start, entry->end); + if (rv) { + failed = entry->start; + entry->wired_count--; + } + } + entry = entry->next; + } + + if (vm_map_pmap(map) == kernel_pmap) { + vm_map_lock(map); + } + else { + lock_clear_recursive(&map->lock); + } + if (rv) { + vm_map_unlock(map); + (void) vm_map_pageable(map, start, failed, TRUE); + return(rv); + } + } + + vm_map_unlock(map); + + return(KERN_SUCCESS); +} + +/* + * vm_map_clean + * + * Push any dirty cached pages in the address range to their pager. + * If syncio is TRUE, dirty pages are written synchronously. + * If invalidate is TRUE, any cached pages are freed as well. + * + * Returns an error if any part of the specified range is not mapped. + */ +int +vm_map_clean(map, start, end, syncio, invalidate) + vm_map_t map; + vm_offset_t start; + vm_offset_t end; + boolean_t syncio; + boolean_t invalidate; +{ + register vm_map_entry_t current; + vm_map_entry_t entry; + vm_size_t size; + vm_object_t object; + vm_offset_t offset; + + vm_map_lock_read(map); + VM_MAP_RANGE_CHECK(map, start, end); + if (!vm_map_lookup_entry(map, start, &entry)) { + vm_map_unlock_read(map); + return(KERN_INVALID_ADDRESS); + } + + /* + * Make a first pass to check for holes. 
+ */ + for (current = entry; current->start < end; current = current->next) { + if (current->is_sub_map) { + vm_map_unlock_read(map); + return(KERN_INVALID_ARGUMENT); + } + if (end > current->end && + (current->next == &map->header || + current->end != current->next->start)) { + vm_map_unlock_read(map); + return(KERN_INVALID_ADDRESS); + } + } + + /* + * Make a second pass, cleaning/uncaching pages from the indicated + * objects as we go. + */ + for (current = entry; current->start < end; current = current->next) { + offset = current->offset + (start - current->start); + size = (end <= current->end ? end : current->end) - start; + if (current->is_a_map) { + register vm_map_t smap; + vm_map_entry_t tentry; + vm_size_t tsize; + + smap = current->object.share_map; + vm_map_lock_read(smap); + (void) vm_map_lookup_entry(smap, offset, &tentry); + tsize = tentry->end - offset; + if (tsize < size) + size = tsize; + object = tentry->object.vm_object; + offset = tentry->offset + (offset - tentry->start); + vm_object_lock(object); + vm_map_unlock_read(smap); + } else { + object = current->object.vm_object; + vm_object_lock(object); + } + /* + * Flush pages if writing is allowed. + * XXX should we continue on an error? + */ + if ((current->protection & VM_PROT_WRITE) && + !vm_object_page_clean(object, offset, offset+size, + syncio, FALSE)) { + vm_object_unlock(object); + vm_map_unlock_read(map); + return(KERN_FAILURE); + } + if (invalidate) + vm_object_page_remove(object, offset, offset+size); + vm_object_unlock(object); + start += size; + } + + vm_map_unlock_read(map); + return(KERN_SUCCESS); +} + +/* + * vm_map_entry_unwire: [ internal use only ] + * + * Make the region specified by this entry pageable. + * + * The map in question should be locked. + * [This is the reason for this routine's existence.] + */ +void vm_map_entry_unwire(map, entry) + vm_map_t map; + register vm_map_entry_t entry; +{ + vm_fault_unwire(map, entry->start, entry->end); + entry->wired_count = 0; +} + +/* + * vm_map_entry_delete: [ internal use only ] + * + * Deallocate the given entry from the target map. + */ +void vm_map_entry_delete(map, entry) + register vm_map_t map; + register vm_map_entry_t entry; +{ + if (entry->wired_count != 0) + vm_map_entry_unwire(map, entry); + + vm_map_entry_unlink(map, entry); + map->size -= entry->end - entry->start; + + if (entry->is_a_map || entry->is_sub_map) + vm_map_deallocate(entry->object.share_map); + else + vm_object_deallocate(entry->object.vm_object); + + vm_map_entry_dispose(map, entry); +} + +/* + * vm_map_delete: [ internal use only ] + * + * Deallocates the given address range from the target + * map. + * + * When called with a sharing map, removes pages from + * that region from all physical maps. + */ +int +vm_map_delete(map, start, end) + register vm_map_t map; + vm_offset_t start; + register vm_offset_t end; +{ + register vm_map_entry_t entry; + vm_map_entry_t first_entry; + + /* + * Find the start of the region, and clip it + */ + + if (!vm_map_lookup_entry(map, start, &first_entry)) + entry = first_entry->next; + else { + entry = first_entry; + vm_map_clip_start(map, entry, start); + + /* + * Fix the lookup hint now, rather than each + * time though the loop. 
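/*
 * An illustrative user-space example, not kernel code: vm_map_clean()
 * above is the sort of primitive a pager-synchronizing call such as
 * msync(2) sits on top of -- push dirty pages in a mapped range back to
 * their pager, optionally synchronously (cf. the syncio argument).  This
 * sketch maps a scratch file, dirties a page, and asks for a synchronous
 * writeback; the file name is arbitrary and error handling is minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	const size_t len = 4096;
	int fd;
	char *p;

	if ((fd = open("scratch.dat", O_RDWR | O_CREAT, 0600)) == -1 ||
	    ftruncate(fd, len) == -1)
		return (1);

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);

	memset(p, 'x', len);			/* dirty the page */

	if (msync(p, len, MS_SYNC) == -1)	/* synchronous flush */
		perror("msync");

	munmap(p, len);
	close(fd);
	return (0);
}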
+ */ + + SAVE_HINT(map, entry->prev); + } + + /* + * Save the free space hint + */ + + if (map->first_free->start >= start) + map->first_free = entry->prev; + + /* + * Step through all entries in this region + */ + + while ((entry != &map->header) && (entry->start < end)) { + vm_map_entry_t next; + register vm_offset_t s, e; + register vm_object_t object; + + vm_map_clip_end(map, entry, end); + + next = entry->next; + s = entry->start; + e = entry->end; + + /* + * Unwire before removing addresses from the pmap; + * otherwise, unwiring will put the entries back in + * the pmap. + */ + + object = entry->object.vm_object; + if (entry->wired_count != 0) + vm_map_entry_unwire(map, entry); + + /* + * If this is a sharing map, we must remove + * *all* references to this data, since we can't + * find all of the physical maps which are sharing + * it. + */ + + if (object == kernel_object || object == kmem_object) + vm_object_page_remove(object, entry->offset, + entry->offset + (e - s)); + else if (!map->is_main_map) + vm_object_pmap_remove(object, + entry->offset, + entry->offset + (e - s)); + else + pmap_remove(map->pmap, s, e); + + /* + * Delete the entry (which may delete the object) + * only after removing all pmap entries pointing + * to its pages. (Otherwise, its page frames may + * be reallocated, and any modify bits will be + * set in the wrong object!) + */ + + vm_map_entry_delete(map, entry); + entry = next; + } + return(KERN_SUCCESS); +} + +/* + * vm_map_remove: + * + * Remove the given address range from the target map. + * This is the exported form of vm_map_delete. + */ +int +vm_map_remove(map, start, end) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; +{ + register int result; + + vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, end); + result = vm_map_delete(map, start, end); + vm_map_unlock(map); + + return(result); +} + +/* + * vm_map_check_protection: + * + * Assert that the target map allows the specified + * privilege on the entire address region given. + * The entire region must be allocated. + */ +boolean_t vm_map_check_protection(map, start, end, protection) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_prot_t protection; +{ + register vm_map_entry_t entry; + vm_map_entry_t tmp_entry; + + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + return(FALSE); + } + + entry = tmp_entry; + + while (start < end) { + if (entry == &map->header) { + return(FALSE); + } + + /* + * No holes allowed! + */ + + if (start < entry->start) { + return(FALSE); + } + + /* + * Check protection associated with entry. + */ + + if ((entry->protection & protection) != protection) { + return(FALSE); + } + + /* go to next entry */ + + start = entry->end; + entry = entry->next; + } + return(TRUE); +} + +/* + * vm_map_copy_entry: + * + * Copies the contents of the source entry to the destination + * entry. The entries *must* be aligned properly. + */ +void vm_map_copy_entry(src_map, dst_map, src_entry, dst_entry) + vm_map_t src_map, dst_map; + register vm_map_entry_t src_entry, dst_entry; +{ + vm_object_t temp_object; + + if (src_entry->is_sub_map || dst_entry->is_sub_map) + return; + + if (dst_entry->object.vm_object != NULL && + (dst_entry->object.vm_object->flags & OBJ_INTERNAL) == 0) + printf("vm_map_copy_entry: copying over permanent data!\n"); + + /* + * If our destination map was wired down, + * unwire it now. 
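/*
 * A minimal sketch of the walk in vm_map_check_protection() above: verify
 * that a whole range is covered by entries with no holes and that every
 * entry grants the requested protection.  Not kernel code; check_range()
 * works over a sorted array of spans, and the names and P_* constants are
 * hypothetical.
 */
#include <stdio.h>

struct pspan {
	unsigned long start, end;	/* [start, end), sorted, non-overlapping */
	int prot;			/* bitmask of granted permissions */
};

#define P_READ	0x1
#define P_WRITE	0x2

static int
check_range(const struct pspan *s, int n, unsigned long start,
    unsigned long end, int want)
{
	int i;

	for (i = 0; i < n && start < end; i++) {
		if (s[i].end <= start)
			continue;		/* entirely before the range */
		if (start < s[i].start)
			return (0);		/* hole before this span */
		if ((s[i].prot & want) != want)
			return (0);		/* insufficient protection */
		start = s[i].end;		/* advance past this span */
	}
	return (start >= end);			/* covered all the way? */
}

int
main(void)
{
	struct pspan s[] = {
		{ 0x1000, 0x3000, P_READ | P_WRITE },
		{ 0x3000, 0x5000, P_READ },
	};

	printf("%d\n", check_range(s, 2, 0x1000, 0x5000, P_READ));	/* 1 */
	printf("%d\n", check_range(s, 2, 0x1000, 0x5000, P_WRITE));	/* 0 */
	return (0);
}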
+ */ + + if (dst_entry->wired_count != 0) + vm_map_entry_unwire(dst_map, dst_entry); + + /* + * If we're dealing with a sharing map, we + * must remove the destination pages from + * all maps (since we cannot know which maps + * this sharing map belongs in). + */ + + if (dst_map->is_main_map) + pmap_remove(dst_map->pmap, dst_entry->start, dst_entry->end); + else + vm_object_pmap_remove(dst_entry->object.vm_object, + dst_entry->offset, + dst_entry->offset + + (dst_entry->end - dst_entry->start)); + + if (src_entry->wired_count == 0) { + + boolean_t src_needs_copy; + + /* + * If the source entry is marked needs_copy, + * it is already write-protected. + */ + if (!src_entry->needs_copy) { + + boolean_t su; + + /* + * If the source entry has only one mapping, + * we can just protect the virtual address + * range. + */ + if (!(su = src_map->is_main_map)) { + simple_lock(&src_map->ref_lock); + su = (src_map->ref_count == 1); + simple_unlock(&src_map->ref_lock); + } + + if (su) { + pmap_protect(src_map->pmap, + src_entry->start, + src_entry->end, + src_entry->protection & ~VM_PROT_WRITE); + } + else { + vm_object_pmap_copy(src_entry->object.vm_object, + src_entry->offset, + src_entry->offset + (src_entry->end + -src_entry->start)); + } + } + + /* + * Make a copy of the object. + */ + temp_object = dst_entry->object.vm_object; + vm_object_copy(src_entry->object.vm_object, + src_entry->offset, + (vm_size_t)(src_entry->end - + src_entry->start), + &dst_entry->object.vm_object, + &dst_entry->offset, + &src_needs_copy); + /* + * If we didn't get a copy-object now, mark the + * source map entry so that a shadow will be created + * to hold its changed pages. + */ + if (src_needs_copy) + src_entry->needs_copy = TRUE; + + /* + * The destination always needs to have a shadow + * created. + */ + dst_entry->needs_copy = TRUE; + + /* + * Mark the entries copy-on-write, so that write-enabling + * the entry won't make copy-on-write pages writable. + */ + src_entry->copy_on_write = TRUE; + dst_entry->copy_on_write = TRUE; + /* + * Get rid of the old object. + */ + vm_object_deallocate(temp_object); + + pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, + dst_entry->end - dst_entry->start, src_entry->start); + } + else { + /* + * Of course, wired down pages can't be set copy-on-write. + * Cause wired pages to be copied into the new + * map by simulating faults (the new pages are + * pageable) + */ + vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); + } +} + +/* + * vm_map_copy: + * + * Perform a virtual memory copy from the source + * address map/range to the destination map/range. + * + * If src_destroy or dst_alloc is requested, + * the source and destination regions should be + * disjoint, not only in the top-level map, but + * in the sharing maps as well. [The best way + * to guarantee this is to use a new intermediate + * map to make copies. This also reduces map + * fragmentation.] 
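/*
 * A minimal sketch of the decision made in vm_map_copy_entry() above:
 * wired source pages are copied physically right away, otherwise the
 * source is write-protected and both entries are marked copy-on-write so
 * the real copy is deferred to the first write fault.  Not kernel code,
 * and simplified (the real code also consults vm_object_copy); centry and
 * copy_entry are hypothetical names.
 */
#include <stdio.h>

struct centry {
	int wired_count;
	int copy_on_write;
	int needs_copy;
	int prot_write;		/* 1 if the hardware mapping allows writes */
};

static void
copy_entry(struct centry *src, struct centry *dst)
{
	if (src->wired_count != 0) {
		/* Wired pages can't be lazily copied: copy them now. */
		printf("physical copy of wired region\n");
		return;
	}
	src->prot_write = 0;		/* write-protect the source mapping */
	src->copy_on_write = dst->copy_on_write = 1;
	src->needs_copy = dst->needs_copy = 1;
	printf("deferred: both entries marked copy-on-write\n");
}

int
main(void)
{
	struct centry src = { 0, 0, 0, 1 }, dst = { 0, 0, 0, 1 };
	struct centry wired_src = { 2, 0, 0, 1 };

	copy_entry(&src, &dst);
	copy_entry(&wired_src, &dst);
	return (0);
}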
+ */ +int +vm_map_copy(dst_map, src_map, + dst_addr, len, src_addr, + dst_alloc, src_destroy) + vm_map_t dst_map; + vm_map_t src_map; + vm_offset_t dst_addr; + vm_size_t len; + vm_offset_t src_addr; + boolean_t dst_alloc; + boolean_t src_destroy; +{ + register + vm_map_entry_t src_entry; + register + vm_map_entry_t dst_entry; + vm_map_entry_t tmp_entry; + vm_offset_t src_start; + vm_offset_t src_end; + vm_offset_t dst_start; + vm_offset_t dst_end; + vm_offset_t src_clip; + vm_offset_t dst_clip; + int result; + boolean_t old_src_destroy; + + /* + * XXX While we figure out why src_destroy screws up, + * we'll do it by explicitly vm_map_delete'ing at the end. + */ + + old_src_destroy = src_destroy; + src_destroy = FALSE; + + /* + * Compute start and end of region in both maps + */ + + src_start = src_addr; + src_end = src_start + len; + dst_start = dst_addr; + dst_end = dst_start + len; + + /* + * Check that the region can exist in both source + * and destination. + */ + + if ((dst_end < dst_start) || (src_end < src_start)) + return(KERN_NO_SPACE); + + /* + * Lock the maps in question -- we avoid deadlock + * by ordering lock acquisition by map value + */ + + if (src_map == dst_map) { + vm_map_lock(src_map); + } + else if ((int) src_map < (int) dst_map) { + vm_map_lock(src_map); + vm_map_lock(dst_map); + } else { + vm_map_lock(dst_map); + vm_map_lock(src_map); + } + + result = KERN_SUCCESS; + + /* + * Check protections... source must be completely readable and + * destination must be completely writable. [Note that if we're + * allocating the destination region, we don't have to worry + * about protection, but instead about whether the region + * exists.] + */ + + if (src_map->is_main_map && dst_map->is_main_map) { + if (!vm_map_check_protection(src_map, src_start, src_end, + VM_PROT_READ)) { + result = KERN_PROTECTION_FAILURE; + goto Return; + } + + if (dst_alloc) { + /* XXX Consider making this a vm_map_find instead */ + if ((result = vm_map_insert(dst_map, NULL, + (vm_offset_t) 0, dst_start, dst_end)) != KERN_SUCCESS) + goto Return; + } + else if (!vm_map_check_protection(dst_map, dst_start, dst_end, + VM_PROT_WRITE)) { + result = KERN_PROTECTION_FAILURE; + goto Return; + } + } + + /* + * Find the start entries and clip. + * + * Note that checking protection asserts that the + * lookup cannot fail. + * + * Also note that we wait to do the second lookup + * until we have done the first clip, as the clip + * may affect which entry we get! + */ + + (void) vm_map_lookup_entry(src_map, src_addr, &tmp_entry); + src_entry = tmp_entry; + vm_map_clip_start(src_map, src_entry, src_start); + + (void) vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry); + dst_entry = tmp_entry; + vm_map_clip_start(dst_map, dst_entry, dst_start); + + /* + * If both source and destination entries are the same, + * retry the first lookup, as it may have changed. + */ + + if (src_entry == dst_entry) { + (void) vm_map_lookup_entry(src_map, src_addr, &tmp_entry); + src_entry = tmp_entry; + } + + /* + * If source and destination entries are still the same, + * a null copy is being performed. + */ + + if (src_entry == dst_entry) + goto Return; + + /* + * Go through entries until we get to the end of the + * region. + */ + + while (src_start < src_end) { + /* + * Clip the entries to the endpoint of the entire region. + */ + + vm_map_clip_end(src_map, src_entry, src_end); + vm_map_clip_end(dst_map, dst_entry, dst_end); + + /* + * Clip each entry to the endpoint of the other entry. 
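/*
 * A minimal user-space sketch of the deadlock-avoidance rule used by
 * vm_map_copy() above: when two maps must both be locked, always lock the
 * lower-addressed one first so two threads can never hold them in opposite
 * orders.  Not kernel code; lock_pair() and unlock_pair() are hypothetical
 * helpers over POSIX mutexes.  Compile with -pthread.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {				/* same object: lock once */
		pthread_mutex_lock(a);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void
unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int
main(void)
{
	pthread_mutex_t src = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t dst = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&src, &dst);		/* consistent order, no ABBA deadlock */
	printf("both maps locked\n");
	unlock_pair(&src, &dst);
	return (0);
}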
+ */ + + src_clip = src_entry->start + (dst_entry->end - dst_entry->start); + vm_map_clip_end(src_map, src_entry, src_clip); + + dst_clip = dst_entry->start + (src_entry->end - src_entry->start); + vm_map_clip_end(dst_map, dst_entry, dst_clip); + + /* + * Both entries now match in size and relative endpoints. + * + * If both entries refer to a VM object, we can + * deal with them now. + */ + + if (!src_entry->is_a_map && !dst_entry->is_a_map) { + vm_map_copy_entry(src_map, dst_map, src_entry, + dst_entry); + } + else { + register vm_map_t new_dst_map; + vm_offset_t new_dst_start; + vm_size_t new_size; + vm_map_t new_src_map; + vm_offset_t new_src_start; + + /* + * We have to follow at least one sharing map. + */ + + new_size = (dst_entry->end - dst_entry->start); + + if (src_entry->is_a_map) { + new_src_map = src_entry->object.share_map; + new_src_start = src_entry->offset; + } + else { + new_src_map = src_map; + new_src_start = src_entry->start; + lock_set_recursive(&src_map->lock); + } + + if (dst_entry->is_a_map) { + vm_offset_t new_dst_end; + + new_dst_map = dst_entry->object.share_map; + new_dst_start = dst_entry->offset; + + /* + * Since the destination sharing entries + * will be merely deallocated, we can + * do that now, and replace the region + * with a null object. [This prevents + * splitting the source map to match + * the form of the destination map.] + * Note that we can only do so if the + * source and destination do not overlap. + */ + + new_dst_end = new_dst_start + new_size; + + if (new_dst_map != new_src_map) { + vm_map_lock(new_dst_map); + (void) vm_map_delete(new_dst_map, + new_dst_start, + new_dst_end); + (void) vm_map_insert(new_dst_map, + NULL, + (vm_offset_t) 0, + new_dst_start, + new_dst_end); + vm_map_unlock(new_dst_map); + } + } + else { + new_dst_map = dst_map; + new_dst_start = dst_entry->start; + lock_set_recursive(&dst_map->lock); + } + + /* + * Recursively copy the sharing map. + */ + + (void) vm_map_copy(new_dst_map, new_src_map, + new_dst_start, new_size, new_src_start, + FALSE, FALSE); + + if (dst_map == new_dst_map) + lock_clear_recursive(&dst_map->lock); + if (src_map == new_src_map) + lock_clear_recursive(&src_map->lock); + } + + /* + * Update variables for next pass through the loop. + */ + + src_start = src_entry->end; + src_entry = src_entry->next; + dst_start = dst_entry->end; + dst_entry = dst_entry->next; + + /* + * If the source is to be destroyed, here is the + * place to do it. + */ + + if (src_destroy && src_map->is_main_map && + dst_map->is_main_map) + vm_map_entry_delete(src_map, src_entry->prev); + } + + /* + * Update the physical maps as appropriate + */ + + if (src_map->is_main_map && dst_map->is_main_map) { + if (src_destroy) + pmap_remove(src_map->pmap, src_addr, src_addr + len); + } + + /* + * Unlock the maps + */ + + Return: ; + + if (old_src_destroy) + vm_map_delete(src_map, src_addr, src_addr + len); + + vm_map_unlock(src_map); + if (src_map != dst_map) + vm_map_unlock(dst_map); + + return(result); +} + +/* + * vmspace_fork: + * Create a new process vmspace structure and vm_map + * based on those of an existing process. The new map + * is based on the old map, according to the inheritance + * values on the regions in that map. + * + * The source map must not be locked. 
+ */ +struct vmspace * +vmspace_fork(vm1) + register struct vmspace *vm1; +{ + register struct vmspace *vm2; + vm_map_t old_map = &vm1->vm_map; + vm_map_t new_map; + vm_map_entry_t old_entry; + vm_map_entry_t new_entry; + pmap_t new_pmap; + + vm_map_lock(old_map); + + vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, + old_map->entries_pageable); + bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, + (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); + new_pmap = &vm2->vm_pmap; /* XXX */ + new_map = &vm2->vm_map; /* XXX */ + + old_entry = old_map->header.next; + + while (old_entry != &old_map->header) { + if (old_entry->is_sub_map) + panic("vm_map_fork: encountered a submap"); + + switch (old_entry->inheritance) { + case VM_INHERIT_NONE: + break; + + case VM_INHERIT_SHARE: + /* + * If we don't already have a sharing map: + */ + + if (!old_entry->is_a_map) { + vm_map_t new_share_map; + vm_map_entry_t new_share_entry; + + /* + * Create a new sharing map + */ + + new_share_map = vm_map_create(NULL, + old_entry->start, + old_entry->end, + TRUE); + new_share_map->is_main_map = FALSE; + + /* + * Create the only sharing entry from the + * old task map entry. + */ + + new_share_entry = + vm_map_entry_create(new_share_map); + *new_share_entry = *old_entry; + new_share_entry->wired_count = 0; + + /* + * Insert the entry into the new sharing + * map + */ + + vm_map_entry_link(new_share_map, + new_share_map->header.prev, + new_share_entry); + + /* + * Fix up the task map entry to refer + * to the sharing map now. + */ + + old_entry->is_a_map = TRUE; + old_entry->object.share_map = new_share_map; + old_entry->offset = old_entry->start; + } + + /* + * Clone the entry, referencing the sharing map. + */ + + new_entry = vm_map_entry_create(new_map); + *new_entry = *old_entry; + new_entry->wired_count = 0; + vm_map_reference(new_entry->object.share_map); + + /* + * Insert the entry into the new map -- we + * know we're inserting at the end of the new + * map. + */ + + vm_map_entry_link(new_map, new_map->header.prev, + new_entry); + + /* + * Update the physical map + */ + + pmap_copy(new_map->pmap, old_map->pmap, + new_entry->start, + (old_entry->end - old_entry->start), + old_entry->start); + break; + + case VM_INHERIT_COPY: + /* + * Clone the entry and link into the map. + */ + + new_entry = vm_map_entry_create(new_map); + *new_entry = *old_entry; + new_entry->wired_count = 0; + new_entry->object.vm_object = NULL; + new_entry->is_a_map = FALSE; + vm_map_entry_link(new_map, new_map->header.prev, + new_entry); + if (old_entry->is_a_map) { + int check; + + check = vm_map_copy(new_map, + old_entry->object.share_map, + new_entry->start, + (vm_size_t)(new_entry->end - + new_entry->start), + old_entry->offset, + FALSE, FALSE); + if (check != KERN_SUCCESS) + printf("vm_map_fork: copy in share_map region failed\n"); + } + else { + vm_map_copy_entry(old_map, new_map, old_entry, + new_entry); + } + break; + } + old_entry = old_entry->next; + } + + new_map->size = old_map->size; + vm_map_unlock(old_map); + + return(vm2); +} + +/* + * vm_map_lookup: + * + * Finds the VM object, offset, and + * protection for a given virtual address in the + * specified map, assuming a page fault of the + * type specified. + * + * Leaves the map in question locked for read; return + * values are guaranteed until a vm_map_lookup_done + * call is performed. Note that the map argument + * is in/out; the returned map must be used in + * the call to vm_map_lookup_done. 
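/*
 * An illustrative user-space analogy, not kernel code: vmspace_fork()
 * above decides per entry whether the child shares a region
 * (VM_INHERIT_SHARE) or receives a copy-on-write copy (VM_INHERIT_COPY).
 * The same distinction is visible from user space as MAP_SHARED versus
 * MAP_PRIVATE anonymous memory across fork(2).  MAP_ANONYMOUS may be
 * spelled MAP_ANON on older systems; error handling is minimal.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
	int *shared = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
	    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	int *priv = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (shared == MAP_FAILED || priv == MAP_FAILED)
		return (1);
	*shared = *priv = 1;

	if (fork() == 0) {		/* child's writes after the fork */
		*shared = 2;		/* visible to the parent (shared) */
		*priv = 2;		/* copy-on-write: parent unaffected */
		_exit(0);
	}
	wait(NULL);
	printf("shared=%d private=%d\n", *shared, *priv);	/* 2 and 1 */
	return (0);
}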
+ * + * A handle (out_entry) is returned for use in + * vm_map_lookup_done, to make that fast. + * + * If a lookup is requested with "write protection" + * specified, the map may be changed to perform virtual + * copying operations, although the data referenced will + * remain the same. + */ +int +vm_map_lookup(var_map, vaddr, fault_type, out_entry, + object, offset, out_prot, wired, single_use) + vm_map_t *var_map; /* IN/OUT */ + register vm_offset_t vaddr; + register vm_prot_t fault_type; + + vm_map_entry_t *out_entry; /* OUT */ + vm_object_t *object; /* OUT */ + vm_offset_t *offset; /* OUT */ + vm_prot_t *out_prot; /* OUT */ + boolean_t *wired; /* OUT */ + boolean_t *single_use; /* OUT */ +{ + vm_map_t share_map; + vm_offset_t share_offset; + register vm_map_entry_t entry; + register vm_map_t map = *var_map; + register vm_prot_t prot; + register boolean_t su; + + RetryLookup: ; + + /* + * Lookup the faulting address. + */ + + vm_map_lock_read(map); + +#define RETURN(why) \ + { \ + vm_map_unlock_read(map); \ + return(why); \ + } + + /* + * If the map has an interesting hint, try it before calling + * full blown lookup routine. + */ + + simple_lock(&map->hint_lock); + entry = map->hint; + simple_unlock(&map->hint_lock); + + *out_entry = entry; + + if ((entry == &map->header) || + (vaddr < entry->start) || (vaddr >= entry->end)) { + vm_map_entry_t tmp_entry; + + /* + * Entry was either not a valid hint, or the vaddr + * was not contained in the entry, so do a full lookup. + */ + if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) + RETURN(KERN_INVALID_ADDRESS); + + entry = tmp_entry; + *out_entry = entry; + } + + /* + * Handle submaps. + */ + + if (entry->is_sub_map) { + vm_map_t old_map = map; + + *var_map = map = entry->object.sub_map; + vm_map_unlock_read(old_map); + goto RetryLookup; + } + + /* + * Check whether this task is allowed to have + * this page. + */ + + prot = entry->protection; + if ((fault_type & (prot)) != fault_type) + RETURN(KERN_PROTECTION_FAILURE); + + /* + * If this page is not pageable, we have to get + * it for all possible accesses. + */ + + if (*wired = (entry->wired_count != 0)) + prot = fault_type = entry->protection; + + /* + * If we don't already have a VM object, track + * it down. + */ + + if (su = !entry->is_a_map) { + share_map = map; + share_offset = vaddr; + } + else { + vm_map_entry_t share_entry; + + /* + * Compute the sharing map, and offset into it. + */ + + share_map = entry->object.share_map; + share_offset = (vaddr - entry->start) + entry->offset; + + /* + * Look for the backing store object and offset + */ + + vm_map_lock_read(share_map); + + if (!vm_map_lookup_entry(share_map, share_offset, + &share_entry)) { + vm_map_unlock_read(share_map); + RETURN(KERN_INVALID_ADDRESS); + } + entry = share_entry; + } + + /* + * If the entry was copy-on-write, we either ... + */ + + if (entry->needs_copy) { + /* + * If we want to write the page, we may as well + * handle that now since we've got the sharing + * map locked. + * + * If we don't need to write the page, we just + * demote the permissions allowed. + */ + + if (fault_type & VM_PROT_WRITE) { + /* + * Make a new object, and place it in the + * object chain. Note that no new references + * have appeared -- one just moved from the + * share map to the new object. 
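/*
 * A minimal user-space sketch of the retry pattern used in vm_map_lookup():
 * the lookup runs under a read lock and, when it must modify an entry,
 * tries to upgrade to a write lock; if the upgrade cannot be granted it
 * drops everything and restarts from RetryLookup.  POSIX rwlocks have no
 * upgrade at all, so lookup_with_upgrade() below (a hypothetical helper)
 * always releases, re-acquires for write, and revalidates what it saw.
 * Compile with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
static int map_generation;		/* bumped by any writer */

static void
lookup_with_upgrade(void)
{
	int seen;

retry:
	pthread_rwlock_rdlock(&map_lock);
	seen = map_generation;		/* state observed under the read lock */

	/* Need to modify: release the read lock, then take the write lock. */
	pthread_rwlock_unlock(&map_lock);
	pthread_rwlock_wrlock(&map_lock);

	if (map_generation != seen) {	/* someone slipped in: start over */
		pthread_rwlock_unlock(&map_lock);
		goto retry;
	}
	map_generation++;		/* the modification itself */
	pthread_rwlock_unlock(&map_lock);
}

int
main(void)
{
	lookup_with_upgrade();
	printf("generation now %d\n", map_generation);
	return (0);
}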
+ */ + + if (lock_read_to_write(&share_map->lock)) { + if (share_map != map) + vm_map_unlock_read(map); + goto RetryLookup; + } + + vm_object_shadow( + &entry->object.vm_object, + &entry->offset, + (vm_size_t) (entry->end - entry->start)); + + entry->needs_copy = FALSE; + + lock_write_to_read(&share_map->lock); + } + else { + /* + * We're attempting to read a copy-on-write + * page -- don't allow writes. + */ + + prot &= (~VM_PROT_WRITE); + } + } + + /* + * Create an object if necessary. + */ + if (entry->object.vm_object == NULL) { + + if (lock_read_to_write(&share_map->lock)) { + if (share_map != map) + vm_map_unlock_read(map); + goto RetryLookup; + } + + entry->object.vm_object = vm_object_allocate( + (vm_size_t)(entry->end - entry->start)); + entry->offset = 0; + lock_write_to_read(&share_map->lock); + } + + /* + * Return the object/offset from this entry. If the entry + * was copy-on-write or empty, it has been fixed up. + */ + + *offset = (share_offset - entry->start) + entry->offset; + *object = entry->object.vm_object; + + /* + * Return whether this is the only map sharing this data. + */ + + if (!su) { + simple_lock(&share_map->ref_lock); + su = (share_map->ref_count == 1); + simple_unlock(&share_map->ref_lock); + } + + *out_prot = prot; + *single_use = su; + + return(KERN_SUCCESS); + +#undef RETURN +} + +/* + * vm_map_lookup_done: + * + * Releases locks acquired by a vm_map_lookup + * (according to the handle returned by that lookup). + */ + +void vm_map_lookup_done(map, entry) + register vm_map_t map; + vm_map_entry_t entry; +{ + /* + * If this entry references a map, unlock it first. + */ + + if (entry->is_a_map) + vm_map_unlock_read(entry->object.share_map); + + /* + * Unlock the main-level map + */ + + vm_map_unlock_read(map); +} + +/* + * Routine: vm_map_simplify + * Purpose: + * Attempt to simplify the map representation in + * the vicinity of the given starting address. + * Note: + * This routine is intended primarily to keep the + * kernel maps more compact -- they generally don't + * benefit from the "expand a map entry" technology + * at allocation time because the adjacent entry + * is often wired down. 
+ */ +void vm_map_simplify(map, start) + vm_map_t map; + vm_offset_t start; +{ + vm_map_entry_t this_entry; + vm_map_entry_t prev_entry; + + vm_map_lock(map); + if ( + (vm_map_lookup_entry(map, start, &this_entry)) && + ((prev_entry = this_entry->prev) != &map->header) && + + (prev_entry->end == start) && + (map->is_main_map) && + + (prev_entry->is_a_map == FALSE) && + (prev_entry->is_sub_map == FALSE) && + + (this_entry->is_a_map == FALSE) && + (this_entry->is_sub_map == FALSE) && + + (prev_entry->inheritance == this_entry->inheritance) && + (prev_entry->protection == this_entry->protection) && + (prev_entry->max_protection == this_entry->max_protection) && + (prev_entry->wired_count == this_entry->wired_count) && + + (prev_entry->copy_on_write == this_entry->copy_on_write) && + (prev_entry->needs_copy == this_entry->needs_copy) && + + (prev_entry->object.vm_object == this_entry->object.vm_object) && + ((prev_entry->offset + (prev_entry->end - prev_entry->start)) + == this_entry->offset) + ) { + if (map->first_free == this_entry) + map->first_free = prev_entry; + + if (!this_entry->object.vm_object->paging_in_progress) { + SAVE_HINT(map, prev_entry); + vm_map_entry_unlink(map, this_entry); + prev_entry->end = this_entry->end; + vm_object_deallocate(this_entry->object.vm_object); + vm_map_entry_dispose(map, this_entry); + } + } + vm_map_unlock(map); +} + +/* + * vm_map_print: [ debug ] + */ +void vm_map_print(map, full) + register vm_map_t map; + boolean_t full; +{ + register vm_map_entry_t entry; + extern int indent; + + iprintf("%s map 0x%x: pmap=0x%x,ref=%d,nentries=%d,version=%d\n", + (map->is_main_map ? "Task" : "Share"), + (int) map, (int) (map->pmap), map->ref_count, map->nentries, + map->timestamp); + + if (!full && indent) + return; + + indent += 2; + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + iprintf("map entry 0x%x: start=0x%x, end=0x%x, ", + (int) entry, (int) entry->start, (int) entry->end); + if (map->is_main_map) { + static char *inheritance_name[4] = + { "share", "copy", "none", "donate_copy"}; + printf("prot=%x/%x/%s, ", + entry->protection, + entry->max_protection, + inheritance_name[entry->inheritance]); + if (entry->wired_count != 0) + printf("wired, "); + } + + if (entry->is_a_map || entry->is_sub_map) { + printf("share=0x%x, offset=0x%x\n", + (int) entry->object.share_map, + (int) entry->offset); + if ((entry->prev == &map->header) || + (!entry->prev->is_a_map) || + (entry->prev->object.share_map != + entry->object.share_map)) { + indent += 2; + vm_map_print(entry->object.share_map, full); + indent -= 2; + } + + } + else { + printf("object=0x%x, offset=0x%x", + (int) entry->object.vm_object, + (int) entry->offset); + if (entry->copy_on_write) + printf(", copy (%s)", + entry->needs_copy ? "needed" : "done"); + printf("\n"); + + if ((entry->prev == &map->header) || + (entry->prev->is_a_map) || + (entry->prev->object.vm_object != + entry->object.vm_object)) { + indent += 2; + vm_object_print(entry->object.vm_object, full); + indent -= 2; + } + } + } + indent -= 2; +} diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h new file mode 100644 index 0000000..ee253ef --- /dev/null +++ b/sys/vm/vm_map.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_map.h 8.3 (Berkeley) 3/15/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory map module definitions. + */ + +#ifndef _VM_MAP_ +#define _VM_MAP_ + +/* + * Types defined: + * + * vm_map_t the high-level address map data structure. + * vm_map_entry_t an entry in an address map. + * vm_map_version_t a timestamp of a map, for use with vm_map_lookup + */ + +/* + * Objects which live in maps may be either VM objects, or + * another map (called a "sharing map") which denotes read-write + * sharing with other maps. 
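+ *
+ * A third variant, a submap, delegates a range of one map's address
+ * space to another map (installed with vm_map_submap(), declared
+ * below); the is_a_map and is_sub_map bits in each map entry record
+ * which union member is valid.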
+ */ + +union vm_map_object { + struct vm_object *vm_object; /* object object */ + struct vm_map *share_map; /* share map */ + struct vm_map *sub_map; /* belongs to another map */ +}; + +/* + * Address map entries consist of start and end addresses, + * a VM object (or sharing map) and offset into that object, + * and user-exported inheritance and protection information. + * Also included is control information for virtual copy operations. + */ +struct vm_map_entry { + struct vm_map_entry *prev; /* previous entry */ + struct vm_map_entry *next; /* next entry */ + vm_offset_t start; /* start address */ + vm_offset_t end; /* end address */ + union vm_map_object object; /* object I point to */ + vm_offset_t offset; /* offset into object */ + boolean_t is_a_map:1, /* Is "object" a map? */ + is_sub_map:1, /* Is "object" a submap? */ + /* Only in sharing maps: */ + copy_on_write:1,/* is data copy-on-write */ + needs_copy:1; /* does object need to be copied */ + /* Only in task maps: */ + vm_prot_t protection; /* protection code */ + vm_prot_t max_protection; /* maximum protection */ + vm_inherit_t inheritance; /* inheritance */ + int wired_count; /* can be paged if = 0 */ +}; + +/* + * Maps are doubly-linked lists of map entries, kept sorted + * by address. A single hint is provided to start + * searches again from the last successful search, + * insertion, or removal. + */ +struct vm_map { + struct pmap * pmap; /* Physical map */ + lock_data_t lock; /* Lock for map data */ + struct vm_map_entry header; /* List of entries */ + int nentries; /* Number of entries */ + vm_size_t size; /* virtual size */ + boolean_t is_main_map; /* Am I a main map? */ + int ref_count; /* Reference count */ + simple_lock_data_t ref_lock; /* Lock for ref_count field */ + vm_map_entry_t hint; /* hint for quick lookups */ + simple_lock_data_t hint_lock; /* lock for hint storage */ + vm_map_entry_t first_free; /* First free space hint */ + boolean_t entries_pageable; /* map entries pageable?? */ + unsigned int timestamp; /* Version number */ +#define min_offset header.start +#define max_offset header.end +}; + +/* + * Map versions are used to validate a previous lookup attempt. + * + * Since lookup operations may involve both a main map and + * a sharing map, it is necessary to have a timestamp from each. + * [If the main map timestamp has changed, the share_map and + * associated timestamp are no longer valid; the map version + * does not include a reference for the imbedded share_map.] + */ +typedef struct { + int main_timestamp; + vm_map_t share_map; + int share_timestamp; +} vm_map_version_t; + +/* + * Macros: vm_map_lock, etc. + * Function: + * Perform locking on the data portion of a map. 
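+ *
+ *	vm_map_lock() takes the map lock for writing and bumps the
+ *	map's timestamp, the value a vm_map_version_t snapshot records
+ *	for later validation.  A minimal usage sketch (assuming "map"
+ *	is a vm_map_t):
+ *
+ *		vm_map_lock(map);
+ *		... add, remove or modify entries ...
+ *		vm_map_unlock(map);
+ *
+ *	Readers use vm_map_lock_read()/vm_map_unlock_read() instead.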
+ */ + +#define vm_map_lock(map) { \ + lock_write(&(map)->lock); \ + (map)->timestamp++; \ +} +#define vm_map_unlock(map) lock_write_done(&(map)->lock) +#define vm_map_lock_read(map) lock_read(&(map)->lock) +#define vm_map_unlock_read(map) lock_read_done(&(map)->lock) + +/* + * Functions implemented as macros + */ +#define vm_map_min(map) ((map)->min_offset) +#define vm_map_max(map) ((map)->max_offset) +#define vm_map_pmap(map) ((map)->pmap) + +/* XXX: number of kernel maps and entries to statically allocate */ +#define MAX_KMAP 10 +#define MAX_KMAPENT 128 + +#ifdef KERNEL +boolean_t vm_map_check_protection __P((vm_map_t, + vm_offset_t, vm_offset_t, vm_prot_t)); +int vm_map_copy __P((vm_map_t, vm_map_t, vm_offset_t, + vm_size_t, vm_offset_t, boolean_t, boolean_t)); +void vm_map_copy_entry __P((vm_map_t, + vm_map_t, vm_map_entry_t, vm_map_entry_t)); +struct pmap; +vm_map_t vm_map_create __P((struct pmap *, + vm_offset_t, vm_offset_t, boolean_t)); +void vm_map_deallocate __P((vm_map_t)); +int vm_map_delete __P((vm_map_t, vm_offset_t, vm_offset_t)); +vm_map_entry_t vm_map_entry_create __P((vm_map_t)); +void vm_map_entry_delete __P((vm_map_t, vm_map_entry_t)); +void vm_map_entry_dispose __P((vm_map_t, vm_map_entry_t)); +void vm_map_entry_unwire __P((vm_map_t, vm_map_entry_t)); +int vm_map_find __P((vm_map_t, vm_object_t, + vm_offset_t, vm_offset_t *, vm_size_t, boolean_t)); +int vm_map_findspace __P((vm_map_t, + vm_offset_t, vm_size_t, vm_offset_t *)); +int vm_map_inherit __P((vm_map_t, + vm_offset_t, vm_offset_t, vm_inherit_t)); +void vm_map_init __P((struct vm_map *, + vm_offset_t, vm_offset_t, boolean_t)); +int vm_map_insert __P((vm_map_t, + vm_object_t, vm_offset_t, vm_offset_t, vm_offset_t)); +int vm_map_lookup __P((vm_map_t *, vm_offset_t, vm_prot_t, + vm_map_entry_t *, vm_object_t *, vm_offset_t *, vm_prot_t *, + boolean_t *, boolean_t *)); +void vm_map_lookup_done __P((vm_map_t, vm_map_entry_t)); +boolean_t vm_map_lookup_entry __P((vm_map_t, + vm_offset_t, vm_map_entry_t *)); +int vm_map_pageable __P((vm_map_t, + vm_offset_t, vm_offset_t, boolean_t)); +int vm_map_clean __P((vm_map_t, + vm_offset_t, vm_offset_t, boolean_t, boolean_t)); +void vm_map_print __P((vm_map_t, boolean_t)); +int vm_map_protect __P((vm_map_t, + vm_offset_t, vm_offset_t, vm_prot_t, boolean_t)); +void vm_map_reference __P((vm_map_t)); +int vm_map_remove __P((vm_map_t, vm_offset_t, vm_offset_t)); +void vm_map_simplify __P((vm_map_t, vm_offset_t)); +void vm_map_simplify_entry __P((vm_map_t, vm_map_entry_t)); +void vm_map_startup __P((void)); +int vm_map_submap __P((vm_map_t, + vm_offset_t, vm_offset_t, vm_map_t)); +#endif +#endif /* _VM_MAP_ */ diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c new file mode 100644 index 0000000..2a8029b --- /dev/null +++ b/sys/vm/vm_meter.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <vm/vm.h> +#include <sys/sysctl.h> + +struct loadavg averunnable; /* load average, of runnable procs */ + +int maxslp = MAXSLP; +int saferss = SAFERSS; + +void +vmmeter() +{ + + if (time.tv_sec % 5 == 0) + loadav(&averunnable); + if (proc0.p_slptime > maxslp/2) + wakeup((caddr_t)&proc0); +} + +/* + * Constants for averages over 1, 5, and 15 minutes + * when sampling at 5 second intervals. + */ +fixpt_t cexp[3] = { + 0.9200444146293232 * FSCALE, /* exp(-1/12) */ + 0.9834714538216174 * FSCALE, /* exp(-1/60) */ + 0.9944598480048967 * FSCALE, /* exp(-1/180) */ +}; + +/* + * Compute a tenex style load average of a quantity on + * 1, 5 and 15 minute intervals. + */ +void +loadav(avg) + register struct loadavg *avg; +{ + register int i, nrun; + register struct proc *p; + + for (nrun = 0, p = (struct proc *)allproc; p != NULL; p = p->p_next) { + switch (p->p_stat) { + case SSLEEP: + if (p->p_priority > PZERO || p->p_slptime != 0) + continue; + /* fall through */ + case SRUN: + case SIDL: + nrun++; + } + } + for (i = 0; i < 3; i++) + avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; +} + +/* + * Attributes associated with virtual memory. + */ +int +vm_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + struct vmtotal vmtotals; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case VM_LOADAVG: + averunnable.fscale = FSCALE; + return (sysctl_rdstruct(oldp, oldlenp, newp, &averunnable, + sizeof(averunnable))); + case VM_METER: + vmtotal(&vmtotals); + return (sysctl_rdstruct(oldp, oldlenp, newp, &vmtotals, + sizeof(vmtotals))); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +/* + * Calculate the current state of the system. + * Done on demand from getkerninfo(). 
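+ *
+ * The result is exported by vm_sysctl() above under VM_METER; a
+ * minimal userland sketch (assuming <sys/sysctl.h> and
+ * <sys/vmmeter.h> for struct vmtotal):
+ *
+ *	int mib[2] = { CTL_VM, VM_METER };
+ *	struct vmtotal vmt;
+ *	size_t len = sizeof(vmt);
+ *
+ *	if (sysctl(mib, 2, &vmt, &len, NULL, 0) == 0)
+ *		printf("%d runnable\n", vmt.t_rq);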
+ */ +void +vmtotal(totalp) + register struct vmtotal *totalp; +{ + register struct proc *p; + register vm_map_entry_t entry; + register vm_object_t object; + register vm_map_t map; + int paging; + + bzero(totalp, sizeof *totalp); + /* + * Mark all objects as inactive. + */ + simple_lock(&vm_object_list_lock); + for (object = vm_object_list.tqh_first; + object != NULL; + object = object->object_list.tqe_next) + object->flags &= ~OBJ_ACTIVE; + simple_unlock(&vm_object_list_lock); + /* + * Calculate process statistics. + */ + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_flag & P_SYSTEM) + continue; + switch (p->p_stat) { + case 0: + continue; + + case SSLEEP: + case SSTOP: + if (p->p_flag & P_INMEM) { + if (p->p_priority <= PZERO) + totalp->t_dw++; + else if (p->p_slptime < maxslp) + totalp->t_sl++; + } else if (p->p_slptime < maxslp) + totalp->t_sw++; + if (p->p_slptime >= maxslp) + continue; + break; + + case SRUN: + case SIDL: + if (p->p_flag & P_INMEM) + totalp->t_rq++; + else + totalp->t_sw++; + if (p->p_stat == SIDL) + continue; + break; + } + /* + * Note active objects. + */ + paging = 0; + for (map = &p->p_vmspace->vm_map, entry = map->header.next; + entry != &map->header; entry = entry->next) { + if (entry->is_a_map || entry->is_sub_map || + entry->object.vm_object == NULL) + continue; + entry->object.vm_object->flags |= OBJ_ACTIVE; + paging |= entry->object.vm_object->paging_in_progress; + } + if (paging) + totalp->t_pw++; + } + /* + * Calculate object memory usage statistics. + */ + simple_lock(&vm_object_list_lock); + for (object = vm_object_list.tqh_first; + object != NULL; + object = object->object_list.tqe_next) { + totalp->t_vm += num_pages(object->size); + totalp->t_rm += object->resident_page_count; + if (object->flags & OBJ_ACTIVE) { + totalp->t_avm += num_pages(object->size); + totalp->t_arm += object->resident_page_count; + } + if (object->ref_count > 1) { + /* shared object */ + totalp->t_vmshr += num_pages(object->size); + totalp->t_rmshr += object->resident_page_count; + if (object->flags & OBJ_ACTIVE) { + totalp->t_avmshr += num_pages(object->size); + totalp->t_armshr += object->resident_page_count; + } + } + } + totalp->t_free = cnt.v_free_count; +} diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c new file mode 100644 index 0000000..2e7204a --- /dev/null +++ b/sys/vm/vm_mmap.c @@ -0,0 +1,836 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ + * + * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 + */ + +/* + * Mapped file (mmap) interface to VM + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filedesc.h> +#include <sys/resourcevar.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mman.h> +#include <sys/conf.h> + +#include <miscfs/specfs/specdev.h> + +#include <vm/vm.h> +#include <vm/vm_pager.h> +#include <vm/vm_prot.h> + +#ifdef DEBUG +int mmapdebug = 0; +#define MDB_FOLLOW 0x01 +#define MDB_SYNC 0x02 +#define MDB_MAPIT 0x04 +#endif + +struct sbrk_args { + int incr; +}; +/* ARGSUSED */ +int +sbrk(p, uap, retval) + struct proc *p; + struct sbrk_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +struct sstk_args { + int incr; +}; +/* ARGSUSED */ +int +sstk(p, uap, retval) + struct proc *p; + struct sstk_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +struct getpagesize_args { + int dummy; +}; +/* ARGSUSED */ +int +ogetpagesize(p, uap, retval) + struct proc *p; + struct getpagesize_args *uap; + int *retval; +{ + + *retval = PAGE_SIZE; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +struct mmap_args { + caddr_t addr; + size_t len; + int prot; + int flags; + int fd; + long pad; + off_t pos; +}; + +#ifdef COMPAT_43 +struct ommap_args { + caddr_t addr; + int len; + int prot; + int flags; + int fd; + long pos; +}; +int +ommap(p, uap, retval) + struct proc *p; + register struct ommap_args *uap; + int *retval; +{ + struct mmap_args nargs; + static const char cvtbsdprot[8] = { + 0, + PROT_EXEC, + PROT_WRITE, + PROT_EXEC|PROT_WRITE, + PROT_READ, + PROT_EXEC|PROT_READ, + PROT_WRITE|PROT_READ, + PROT_EXEC|PROT_WRITE|PROT_READ, + }; +#define OMAP_ANON 0x0002 +#define OMAP_COPY 0x0020 +#define OMAP_SHARED 0x0010 +#define OMAP_FIXED 0x0100 +#define OMAP_INHERIT 0x0800 + + nargs.addr = uap->addr; + nargs.len = uap->len; + nargs.prot = cvtbsdprot[uap->prot&0x7]; + nargs.flags = 0; + if (uap->flags & OMAP_ANON) + nargs.flags |= MAP_ANON; + if (uap->flags & OMAP_COPY) + nargs.flags |= MAP_COPY; + if (uap->flags & OMAP_SHARED) + nargs.flags |= MAP_SHARED; + else + nargs.flags |= MAP_PRIVATE; + if (uap->flags & OMAP_FIXED) + nargs.flags |= MAP_FIXED; + if (uap->flags & OMAP_INHERIT) + nargs.flags |= MAP_INHERIT; + nargs.fd = uap->fd; + nargs.pos = uap->pos; + return (mmap(p, &nargs, retval)); +} +#endif + +int +mmap(p, uap, retval) + struct proc *p; + register 
struct mmap_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vnode *vp; + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot, maxprot; + caddr_t handle; + int flags, error; + + prot = uap->prot & VM_PROT_ALL; + flags = uap->flags; +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("mmap(%d): addr %x len %x pro %x flg %x fd %d pos %x\n", + p->p_pid, uap->addr, uap->len, prot, + flags, uap->fd, (vm_offset_t)uap->pos); +#endif + /* + * Address (if FIXED) must be page aligned. + * Size is implicitly rounded to a page boundary. + */ + addr = (vm_offset_t) uap->addr; + if (((flags & MAP_FIXED) && (addr & PAGE_MASK)) || + (ssize_t)uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) + return (EINVAL); + size = (vm_size_t) round_page(uap->len); + /* + * Check for illegal addresses. Watch out for address wrap... + * Note that VM_*_ADDRESS are not constants due to casts (argh). + */ + if (flags & MAP_FIXED) { + if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) + return (EINVAL); +#ifndef i386 + if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) + return (EINVAL); +#endif + if (addr > addr + size) + return (EINVAL); + } + /* + * XXX if no hint provided for a non-fixed mapping place it after + * the end of the largest possible heap. + * + * There should really be a pmap call to determine a reasonable + * location. + */ + if (addr == 0 && (flags & MAP_FIXED) == 0) + addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); + if (flags & MAP_ANON) { + /* + * Mapping blank space is trivial. + */ + handle = NULL; + maxprot = VM_PROT_ALL; + } else { + /* + * Mapping file, get fp for validation. + * Obtain vnode and make sure it is of appropriate type. + */ + if (((unsigned)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + if (vp->v_type != VREG && vp->v_type != VCHR) + return (EINVAL); + /* + * XXX hack to handle use of /dev/zero to map anon + * memory (ala SunOS). + */ + if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { + handle = NULL; + maxprot = VM_PROT_ALL; + flags |= MAP_ANON; + } else { + /* + * Ensure that file and memory protections are + * compatible. Note that we only worry about + * writability if mapping is shared; in this case, + * current and max prot are dictated by the open file. + * XXX use the vnode instead? Problem is: what + * credentials do we use for determination? + * What if proc does a setuid? + */ + maxprot = VM_PROT_EXECUTE; /* ??? 
*/ + if (fp->f_flag & FREAD) + maxprot |= VM_PROT_READ; + else if (prot & PROT_READ) + return (EACCES); + if (flags & MAP_SHARED) { + if (fp->f_flag & FWRITE) + maxprot |= VM_PROT_WRITE; + else if (prot & PROT_WRITE) + return (EACCES); + } else + maxprot |= VM_PROT_WRITE; + handle = (caddr_t)vp; + } + } + error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, + flags, handle, (vm_offset_t)uap->pos); + if (error == 0) + *retval = (int)addr; + return (error); +} + +struct msync_args { + caddr_t addr; + int len; +}; +int +msync(p, uap, retval) + struct proc *p; + struct msync_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + vm_map_t map; + int rv; + boolean_t syncio, invalidate; + +#ifdef DEBUG + if (mmapdebug & (MDB_FOLLOW|MDB_SYNC)) + printf("msync(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + if (((int)uap->addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) + return (EINVAL); + map = &p->p_vmspace->vm_map; + addr = (vm_offset_t)uap->addr; + size = (vm_size_t)uap->len; + /* + * XXX Gak! If size is zero we are supposed to sync "all modified + * pages with the region containing addr". Unfortunately, we + * don't really keep track of individual mmaps so we approximate + * by flushing the range of the map entry containing addr. + * This can be incorrect if the region splits or is coalesced + * with a neighbor. + */ + if (size == 0) { + vm_map_entry_t entry; + + vm_map_lock_read(map); + rv = vm_map_lookup_entry(map, addr, &entry); + vm_map_unlock_read(map); + if (rv) + return (EINVAL); + addr = entry->start; + size = entry->end - entry->start; + } +#ifdef DEBUG + if (mmapdebug & MDB_SYNC) + printf("msync: cleaning/flushing address range [%x-%x)\n", + addr, addr+size); +#endif + /* + * Could pass this in as a third flag argument to implement + * Sun's MS_ASYNC. + */ + syncio = TRUE; + /* + * XXX bummer, gotta flush all cached pages to ensure + * consistency with the file system cache. Otherwise, we could + * pass this in to implement Sun's MS_INVALIDATE. + */ + invalidate = TRUE; + /* + * Clean the pages and interpret the return value. + */ + rv = vm_map_clean(map, addr, addr+size, syncio, invalidate); + switch (rv) { + case KERN_SUCCESS: + break; + case KERN_INVALID_ADDRESS: + return (EINVAL); /* Sun returns ENOMEM? */ + case KERN_FAILURE: + return (EIO); + default: + return (EINVAL); + } + return (0); +} + +struct munmap_args { + caddr_t addr; + int len; +}; +int +munmap(p, uap, retval) + register struct proc *p; + register struct munmap_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + vm_map_t map; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("munmap(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + + addr = (vm_offset_t) uap->addr; + if ((addr & PAGE_MASK) || uap->len < 0) + return(EINVAL); + size = (vm_size_t) round_page(uap->len); + if (size == 0) + return(0); + /* + * Check for illegal addresses. Watch out for address wrap... + * Note that VM_*_ADDRESS are not constants due to casts (argh). + */ + if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) + return (EINVAL); +#ifndef i386 + if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) + return (EINVAL); +#endif + if (addr > addr + size) + return (EINVAL); + map = &p->p_vmspace->vm_map; + /* + * Make sure entire range is allocated. 
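+	 * (vm_map_check_protection() with VM_PROT_NONE only verifies
+	 * that every page in the range is covered by some map entry;
+	 * no particular access right is required.)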
+ */ + if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) + return(EINVAL); + /* returns nothing but KERN_SUCCESS anyway */ + (void) vm_map_remove(map, addr, addr+size); + return(0); +} + +void +munmapfd(fd) + int fd; +{ +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("munmapfd(%d): fd %d\n", curproc->p_pid, fd); +#endif + + /* + * XXX should vm_deallocate any regions mapped to this file + */ + curproc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; +} + +struct mprotect_args { + caddr_t addr; + int len; + int prot; +}; +int +mprotect(p, uap, retval) + struct proc *p; + struct mprotect_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + register vm_prot_t prot; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("mprotect(%d): addr %x len %x prot %d\n", + p->p_pid, uap->addr, uap->len, uap->prot); +#endif + + addr = (vm_offset_t)uap->addr; + if ((addr & PAGE_MASK) || uap->len < 0) + return(EINVAL); + size = (vm_size_t)uap->len; + prot = uap->prot & VM_PROT_ALL; + + switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, prot, + FALSE)) { + case KERN_SUCCESS: + return (0); + case KERN_PROTECTION_FAILURE: + return (EACCES); + } + return (EINVAL); +} + +struct madvise_args { + caddr_t addr; + int len; + int behav; +}; +/* ARGSUSED */ +int +madvise(p, uap, retval) + struct proc *p; + struct madvise_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +struct mincore_args { + caddr_t addr; + int len; + char *vec; +}; +/* ARGSUSED */ +int +mincore(p, uap, retval) + struct proc *p; + struct mincore_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +struct mlock_args { + caddr_t addr; + size_t len; +}; +int +mlock(p, uap, retval) + struct proc *p; + struct mlock_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + int error; + extern int vm_page_max_wired; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("mlock(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + addr = (vm_offset_t)uap->addr; + if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) + return (EINVAL); + size = round_page((vm_size_t)uap->len); + if (atop(size) + cnt.v_wire_count > vm_page_max_wired) + return (EAGAIN); +#ifdef pmap_wired_count + if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > + p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) + return (EAGAIN); +#else + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + + error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE); + return (error == KERN_SUCCESS ? 0 : ENOMEM); +} + +struct munlock_args { + caddr_t addr; + size_t len; +}; +int +munlock(p, uap, retval) + struct proc *p; + struct munlock_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + int error; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("munlock(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + addr = (vm_offset_t)uap->addr; + if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) + return (EINVAL); +#ifndef pmap_wired_count + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + size = round_page((vm_size_t)uap->len); + + error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE); + return (error == KERN_SUCCESS ? 0 : ENOMEM); +} + +/* + * Internal version of mmap. + * Currently used by mmap, exec, and sys5 shared memory. + * Handle is either a vnode pointer or NULL for MAP_ANON. 
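+ *
+ * A typical in-kernel call mirrors the tail of mmap() above (sketch;
+ * "vp" is the backing vnode, or the handle is NULL for MAP_ANON):
+ *
+ *	error = vm_mmap(&p->p_vmspace->vm_map, &addr, size,
+ *	    prot, maxprot, flags, (caddr_t)vp, foff);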
+ */ +int +vm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + vm_prot_t prot, maxprot; + register int flags; + caddr_t handle; /* XXX should be vp */ + vm_offset_t foff; +{ + register vm_pager_t pager; + boolean_t fitit; + vm_object_t object; + struct vnode *vp = NULL; + int type; + int rv = KERN_SUCCESS; + + if (size == 0) + return (0); + + if ((flags & MAP_FIXED) == 0) { + fitit = TRUE; + *addr = round_page(*addr); + } else { + fitit = FALSE; + (void)vm_deallocate(map, *addr, size); + } + + /* + * Lookup/allocate pager. All except an unnamed anonymous lookup + * gain a reference to ensure continued existance of the object. + * (XXX the exception is to appease the pageout daemon) + */ + if (flags & MAP_ANON) + type = PG_DFLT; + else { + vp = (struct vnode *)handle; + if (vp->v_type == VCHR) { + type = PG_DEVICE; + handle = (caddr_t)vp->v_rdev; + } else + type = PG_VNODE; + } + pager = vm_pager_allocate(type, handle, size, prot, foff); + if (pager == NULL) + return (type == PG_DEVICE ? EINVAL : ENOMEM); + /* + * Find object and release extra reference gained by lookup + */ + object = vm_object_lookup(pager); + vm_object_deallocate(object); + + /* + * Anonymous memory. + */ + if (flags & MAP_ANON) { + rv = vm_allocate_with_pager(map, addr, size, fitit, + pager, foff, TRUE); + if (rv != KERN_SUCCESS) { + if (handle == NULL) + vm_pager_deallocate(pager); + else + vm_object_deallocate(object); + goto out; + } + /* + * Don't cache anonymous objects. + * Loses the reference gained by vm_pager_allocate. + * Note that object will be NULL when handle == NULL, + * this is ok since vm_allocate_with_pager has made + * sure that these objects are uncached. + */ + (void) pager_cache(object, FALSE); +#ifdef DEBUG + if (mmapdebug & MDB_MAPIT) + printf("vm_mmap(%d): ANON *addr %x size %x pager %x\n", + curproc->p_pid, *addr, size, pager); +#endif + } + /* + * Must be a mapped file. + * Distinguish between character special and regular files. + */ + else if (vp->v_type == VCHR) { + rv = vm_allocate_with_pager(map, addr, size, fitit, + pager, foff, FALSE); + /* + * Uncache the object and lose the reference gained + * by vm_pager_allocate(). If the call to + * vm_allocate_with_pager() was sucessful, then we + * gained an additional reference ensuring the object + * will continue to exist. If the call failed then + * the deallocate call below will terminate the + * object which is fine. + */ + (void) pager_cache(object, FALSE); + if (rv != KERN_SUCCESS) + goto out; + } + /* + * A regular file + */ + else { +#ifdef DEBUG + if (object == NULL) + printf("vm_mmap: no object: vp %x, pager %x\n", + vp, pager); +#endif + /* + * Map it directly. + * Allows modifications to go out to the vnode. + */ + if (flags & MAP_SHARED) { + rv = vm_allocate_with_pager(map, addr, size, + fitit, pager, + foff, FALSE); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + goto out; + } + /* + * Don't cache the object. This is the easiest way + * of ensuring that data gets back to the filesystem + * because vnode_pager_deallocate() will fsync the + * vnode. pager_cache() will lose the extra ref. + */ + if (prot & VM_PROT_WRITE) + pager_cache(object, FALSE); + else + vm_object_deallocate(object); + } + /* + * Copy-on-write of file. Two flavors. + * MAP_COPY is true COW, you essentially get a snapshot of + * the region at the time of mapping. MAP_PRIVATE means only + * that your changes are not reflected back to the object. 
+ * Changes made by others will be seen. + */ + else { + vm_map_t tmap; + vm_offset_t off; + + /* locate and allocate the target address space */ + rv = vm_map_find(map, NULL, (vm_offset_t)0, + addr, size, fitit); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + goto out; + } + tmap = vm_map_create(pmap_create(size), VM_MIN_ADDRESS, + VM_MIN_ADDRESS+size, TRUE); + off = VM_MIN_ADDRESS; + rv = vm_allocate_with_pager(tmap, &off, size, + TRUE, pager, + foff, FALSE); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + vm_map_deallocate(tmap); + goto out; + } + /* + * (XXX) + * MAP_PRIVATE implies that we see changes made by + * others. To ensure that we need to guarentee that + * no copy object is created (otherwise original + * pages would be pushed to the copy object and we + * would never see changes made by others). We + * totally sleeze it right now by marking the object + * internal temporarily. + */ + if ((flags & MAP_COPY) == 0) + object->flags |= OBJ_INTERNAL; + rv = vm_map_copy(map, tmap, *addr, size, off, + FALSE, FALSE); + object->flags &= ~OBJ_INTERNAL; + /* + * (XXX) + * My oh my, this only gets worse... + * Force creation of a shadow object so that + * vm_map_fork will do the right thing. + */ + if ((flags & MAP_COPY) == 0) { + vm_map_t tmap; + vm_map_entry_t tentry; + vm_object_t tobject; + vm_offset_t toffset; + vm_prot_t tprot; + boolean_t twired, tsu; + + tmap = map; + vm_map_lookup(&tmap, *addr, VM_PROT_WRITE, + &tentry, &tobject, &toffset, + &tprot, &twired, &tsu); + vm_map_lookup_done(tmap, tentry); + } + /* + * (XXX) + * Map copy code cannot detect sharing unless a + * sharing map is involved. So we cheat and write + * protect everything ourselves. + */ + vm_object_pmap_copy(object, foff, foff + size); + vm_object_deallocate(object); + vm_map_deallocate(tmap); + if (rv != KERN_SUCCESS) + goto out; + } +#ifdef DEBUG + if (mmapdebug & MDB_MAPIT) + printf("vm_mmap(%d): FILE *addr %x size %x pager %x\n", + curproc->p_pid, *addr, size, pager); +#endif + } + /* + * Correct protection (default is VM_PROT_ALL). + * If maxprot is different than prot, we must set both explicitly. + */ + rv = KERN_SUCCESS; + if (maxprot != VM_PROT_ALL) + rv = vm_map_protect(map, *addr, *addr+size, maxprot, TRUE); + if (rv == KERN_SUCCESS && prot != maxprot) + rv = vm_map_protect(map, *addr, *addr+size, prot, FALSE); + if (rv != KERN_SUCCESS) { + (void) vm_deallocate(map, *addr, size); + goto out; + } + /* + * Shared memory is also shared with children. + */ + if (flags & MAP_SHARED) { + rv = vm_map_inherit(map, *addr, *addr+size, VM_INHERIT_SHARE); + if (rv != KERN_SUCCESS) { + (void) vm_deallocate(map, *addr, size); + goto out; + } + } +out: +#ifdef DEBUG + if (mmapdebug & MDB_MAPIT) + printf("vm_mmap: rv %d\n", rv); +#endif + switch (rv) { + case KERN_SUCCESS: + return (0); + case KERN_INVALID_ADDRESS: + case KERN_NO_SPACE: + return (ENOMEM); + case KERN_PROTECTION_FAILURE: + return (EACCES); + default: + return (EINVAL); + } +} diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c new file mode 100644 index 0000000..a6419dc --- /dev/null +++ b/sys/vm/vm_object.c @@ -0,0 +1,1645 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_object.c 8.5 (Berkeley) 3/22/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory object module. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +static void _vm_object_allocate(vm_size_t, vm_object_t); +void vm_object_deactivate_pages(vm_object_t); +void vm_object_cache_trim(void); +void vm_object_remove(vm_pager_t); + +/* + * Virtual memory objects maintain the actual data + * associated with allocated virtual memory. A given + * page of memory exists within exactly one object. + * + * An object is only deallocated when all "references" + * are given up. Only one "reference" to a given + * region of an object should be writeable. 
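+ *
+ * A sketch of the reference-count life cycle implemented below:
+ *
+ *	object = vm_object_allocate(size);	ref_count == 1
+ *	vm_object_reference(object);		ref_count == 2
+ *	vm_object_deallocate(object);		ref_count == 1
+ *	vm_object_deallocate(object);		last reference: the object
+ *						is terminated, or cached
+ *						if it may persist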
+ * + * Associated with each object is a list of all resident + * memory pages belonging to that object; this list is + * maintained by the "vm_page" module, and locked by the object's + * lock. + * + * Each object also records a "pager" routine which is + * used to retrieve (and store) pages to the proper backing + * storage. In addition, objects may be backed by other + * objects from which they were virtual-copied. + * + * The only items within the object structure which are + * modified after time of creation are: + * reference count locked by object's lock + * pager routine locked by object's lock + * + */ + + +struct vm_object kernel_object_store; +struct vm_object kmem_object_store; + +extern int vm_cache_max; +#define VM_OBJECT_HASH_COUNT 157 + +struct vm_object_hash_head vm_object_hashtable[VM_OBJECT_HASH_COUNT]; + +long object_collapses = 0; +long object_bypasses = 0; + +static void +_vm_object_allocate(size, object) + vm_size_t size; + register vm_object_t object; +{ + bzero(object, sizeof *object); + TAILQ_INIT(&object->memq); + vm_object_lock_init(object); + object->ref_count = 1; + object->resident_page_count = 0; + object->size = size; + object->flags = OBJ_INTERNAL; /* vm_allocate_with_pager will reset */ + object->paging_in_progress = 0; + object->copy = NULL; + + /* + * Object starts out read-write, with no pager. + */ + + object->pager = NULL; + object->paging_offset = 0; + object->shadow = NULL; + object->shadow_offset = (vm_offset_t) 0; + + simple_lock(&vm_object_list_lock); + TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); + vm_object_count++; + cnt.v_nzfod += atop(size); + simple_unlock(&vm_object_list_lock); +} + +/* + * vm_object_init: + * + * Initialize the VM objects module. + */ +void +vm_object_init(vm_offset_t nothing) +{ + register int i; + + TAILQ_INIT(&vm_object_cached_list); + TAILQ_INIT(&vm_object_list); + vm_object_count = 0; + simple_lock_init(&vm_cache_lock); + simple_lock_init(&vm_object_list_lock); + + for (i = 0; i < VM_OBJECT_HASH_COUNT; i++) + TAILQ_INIT(&vm_object_hashtable[i]); + + kernel_object = &kernel_object_store; + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, + kernel_object); + + kmem_object = &kmem_object_store; + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, + kmem_object); +} + +/* + * vm_object_allocate: + * + * Returns a new object with the given size. + */ + +vm_object_t +vm_object_allocate(size) + vm_size_t size; +{ + register vm_object_t result; + int s; + + result = (vm_object_t) + malloc((u_long)sizeof *result, M_VMOBJ, M_WAITOK); + + + _vm_object_allocate(size, result); + + return(result); +} + + +/* + * vm_object_reference: + * + * Gets another reference to the given object. + */ +inline void +vm_object_reference(object) + register vm_object_t object; +{ + if (object == NULL) + return; + + vm_object_lock(object); + object->ref_count++; + vm_object_unlock(object); +} + +/* + * vm_object_deallocate: + * + * Release a reference to the specified object, + * gained either through a vm_object_allocate + * or a vm_object_reference call. When all references + * are gone, storage associated with this object + * may be relinquished. + * + * No object may be locked. + */ +void +vm_object_deallocate(object) + vm_object_t object; +{ + vm_object_t temp; + + while (object != NULL) { + + /* + * The cache holds a reference (uncounted) to + * the object; we must lock it before removing + * the object. 
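+		 * (The lock ordering matches vm_object_lookup():
+		 * cache lock first, then the object lock.)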
+ */ + + vm_object_cache_lock(); + + /* + * Lose the reference + */ + vm_object_lock(object); + if (--(object->ref_count) != 0) { + + vm_object_unlock(object); + /* + * If there are still references, then + * we are done. + */ + vm_object_cache_unlock(); + return; + } + + /* + * See if this object can persist. If so, enter + * it in the cache, then deactivate all of its + * pages. + */ + + if (object->flags & OBJ_CANPERSIST) { + + TAILQ_INSERT_TAIL(&vm_object_cached_list, object, + cached_list); + vm_object_cached++; + vm_object_cache_unlock(); + +/* + * this code segment was removed because it kills performance with + * large -- repetively used binaries. The functionality now resides + * in the pageout daemon + * vm_object_deactivate_pages(object); + */ + vm_object_unlock(object); + + vm_object_cache_trim(); + return; + } + + /* + * Make sure no one can look us up now. + */ + vm_object_remove(object->pager); + vm_object_cache_unlock(); + + temp = object->shadow; + vm_object_terminate(object); + /* unlocks and deallocates object */ + object = temp; + } +} + +/* + * vm_object_terminate actually destroys the specified object, freeing + * up all previously used resources. + * + * The object must be locked. + */ +void +vm_object_terminate(object) + register vm_object_t object; +{ + register vm_page_t p; + vm_object_t shadow_object; + int s; + + /* + * Detach the object from its shadow if we are the shadow's + * copy. + */ + if ((shadow_object = object->shadow) != NULL) { + vm_object_lock(shadow_object); + if (shadow_object->copy == object) + shadow_object->copy = NULL; +/* + else if (shadow_object->copy != NULL) + panic("vm_object_terminate: copy/shadow inconsistency"); +*/ + vm_object_unlock(shadow_object); + } + + /* + * Wait until the pageout daemon is through + * with the object. + */ + + while (object->paging_in_progress) { + vm_object_sleep((int)object, object, FALSE); + vm_object_lock(object); + } + + /* + * While the paging system is locked, + * pull the object's pages off the active + * and inactive queues. This keeps the + * pageout daemon from playing with them + * during vm_pager_deallocate. + * + * We can't free the pages yet, because the + * object's pager may have to write them out + * before deallocating the paging space. + */ + + for( p = object->memq.tqh_first; p; p=p->listq.tqe_next) { + VM_PAGE_CHECK(p); + + vm_page_lock_queues(); + s = splimp(); + if (p->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, p, pageq); + p->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + + if (p->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, p, pageq); + p->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + } + splx(s); + vm_page_unlock_queues(); + } + + vm_object_unlock(object); + + if (object->paging_in_progress != 0) + panic("vm_object_deallocate: pageout in progress"); + + /* + * Clean and free the pages, as appropriate. + * All references to the object are gone, + * so we don't need to lock it. + */ + + if ((object->flags & OBJ_INTERNAL) == 0) { + vm_object_lock(object); + (void) vm_object_page_clean(object, 0, 0, TRUE, TRUE); + vm_object_unlock(object); + } + + /* + * Now free the pages. + * For internal objects, this also removes them from paging queues. + */ + while ((p = object->memq.tqh_first) != NULL) { + VM_PAGE_CHECK(p); + vm_page_lock_queues(); + vm_page_free(p); + cnt.v_pfree++; + vm_page_unlock_queues(); + } + + /* + * Let the pager know object is dead. 
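+	 * For a vnode-backed object this is also where modified data
+	 * finally reaches the filesystem: vnode_pager_deallocate()
+	 * fsyncs the vnode (see the note in vm_mmap()).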
+ */ + + if (object->pager != NULL) + vm_pager_deallocate(object->pager); + + + simple_lock(&vm_object_list_lock); + TAILQ_REMOVE(&vm_object_list, object, object_list); + vm_object_count--; + simple_unlock(&vm_object_list_lock); + + /* + * Free the space for the object. + */ + + free((caddr_t)object, M_VMOBJ); +} + +/* + * vm_object_page_clean + * + * Clean all dirty pages in the specified range of object. + * Leaves page on whatever queue it is currently on. + * + * Odd semantics: if start == end, we clean everything. + * + * The object must be locked. + */ +#if 1 +boolean_t +vm_object_page_clean(object, start, end, syncio, de_queue) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; + boolean_t syncio; + boolean_t de_queue; +{ + register vm_page_t p, nextp; + int s; + int size; + + if (object->pager == NULL) + return 1; + + if (start != end) { + start = trunc_page(start); + end = round_page(end); + } + size = end - start; + +again: + /* + * Wait until the pageout daemon is through with the object. + */ + while (object->paging_in_progress) { + vm_object_sleep((int)object, object, FALSE); + } + + nextp = object->memq.tqh_first; + while ( (p = nextp) && ((start == end) || (size != 0) ) ) { + nextp = p->listq.tqe_next; + if (start == end || (p->offset >= start && p->offset < end)) { + if (p->flags & PG_BUSY) + continue; + + size -= PAGE_SIZE; + + if ((p->flags & PG_CLEAN) + && pmap_is_modified(VM_PAGE_TO_PHYS(p))) + p->flags &= ~PG_CLEAN; + + if ((p->flags & PG_CLEAN) == 0) { + vm_pageout_clean(p,VM_PAGEOUT_FORCE); + goto again; + } + } + } + wakeup((caddr_t)object); + return 1; +} +#endif +/* + * vm_object_page_clean + * + * Clean all dirty pages in the specified range of object. + * If syncio is TRUE, page cleaning is done synchronously. + * If de_queue is TRUE, pages are removed from any paging queue + * they were on, otherwise they are left on whatever queue they + * were on before the cleaning operation began. + * + * Odd semantics: if start == end, we clean everything. + * + * The object must be locked. + * + * Returns TRUE if all was well, FALSE if there was a pager error + * somewhere. We attempt to clean (and dequeue) all pages regardless + * of where an error occurs. + */ +#if 0 +boolean_t +vm_object_page_clean(object, start, end, syncio, de_queue) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; + boolean_t syncio; + boolean_t de_queue; +{ + register vm_page_t p; + int onqueue; + boolean_t noerror = TRUE; + + if (object == NULL) + return (TRUE); + + /* + * If it is an internal object and there is no pager, attempt to + * allocate one. Note that vm_object_collapse may relocate one + * from a collapsed object so we must recheck afterward. + */ + if ((object->flags & OBJ_INTERNAL) && object->pager == NULL) { + vm_object_collapse(object); + if (object->pager == NULL) { + vm_pager_t pager; + + vm_object_unlock(object); + pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, + object->size, VM_PROT_ALL, + (vm_offset_t)0); + if (pager) + vm_object_setpager(object, pager, 0, FALSE); + vm_object_lock(object); + } + } + if (object->pager == NULL) + return (FALSE); + +again: + /* + * Wait until the pageout daemon is through with the object. + */ + while (object->paging_in_progress) { + vm_object_sleep((int)object, object, FALSE); + vm_object_lock(object); + } + /* + * Loop through the object page list cleaning as necessary. 
+ */ + for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + onqueue = 0; + if ((start == end || p->offset >= start && p->offset < end) && + !(p->flags & PG_FICTITIOUS)) { + if ((p->flags & PG_CLEAN) && + pmap_is_modified(VM_PAGE_TO_PHYS(p))) + p->flags &= ~PG_CLEAN; + /* + * Remove the page from any paging queue. + * This needs to be done if either we have been + * explicitly asked to do so or it is about to + * be cleaned (see comment below). + */ + if (de_queue || !(p->flags & PG_CLEAN)) { + vm_page_lock_queues(); + if (p->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, + p, pageq); + p->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + onqueue = 1; + } else if (p->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, + p, pageq); + p->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + onqueue = -1; + } else + onqueue = 0; + vm_page_unlock_queues(); + } + /* + * To ensure the state of the page doesn't change + * during the clean operation we do two things. + * First we set the busy bit and write-protect all + * mappings to ensure that write accesses to the + * page block (in vm_fault). Second, we remove + * the page from any paging queue to foil the + * pageout daemon (vm_pageout_scan). + */ + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_READ); + if (!(p->flags & PG_CLEAN)) { + p->flags |= PG_BUSY; + object->paging_in_progress++; + vm_object_unlock(object); + /* + * XXX if put fails we mark the page as + * clean to avoid an infinite loop. + * Will loose changes to the page. + */ + if (vm_pager_put(object->pager, p, syncio)) { + printf("%s: pager_put error\n", + "vm_object_page_clean"); + p->flags |= PG_CLEAN; + noerror = FALSE; + } + vm_object_lock(object); + object->paging_in_progress--; + if (!de_queue && onqueue) { + vm_page_lock_queues(); + if (onqueue > 0) + vm_page_activate(p); + else + vm_page_deactivate(p); + vm_page_unlock_queues(); + } + p->flags &= ~PG_BUSY; + PAGE_WAKEUP(p); + goto again; + } + } + } + return (noerror); +} +#endif + +/* + * vm_object_deactivate_pages + * + * Deactivate all pages in the specified object. (Keep its pages + * in memory even though it is no longer referenced.) + * + * The object must be locked. + */ +void +vm_object_deactivate_pages(object) + register vm_object_t object; +{ + register vm_page_t p, next; + + for (p = object->memq.tqh_first; p != NULL; p = next) { + next = p->listq.tqe_next; + vm_page_lock_queues(); + vm_page_deactivate(p); + vm_page_unlock_queues(); + } +} + +/* + * Trim the object cache to size. + */ +void +vm_object_cache_trim() +{ + register vm_object_t object; + + vm_object_cache_lock(); + while (vm_object_cached > vm_cache_max) { + object = vm_object_cached_list.tqh_first; + vm_object_cache_unlock(); + + if (object != vm_object_lookup(object->pager)) + panic("vm_object_deactivate: I'm sooo confused."); + + pager_cache(object, FALSE); + + vm_object_cache_lock(); + } + vm_object_cache_unlock(); +} + + +/* + * vm_object_pmap_copy: + * + * Makes all physical pages in the specified + * object range copy-on-write. No writeable + * references to these pages should remain. + * + * The object must *not* be locked. 
+ */ +void vm_object_pmap_copy(object, start, end) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; +{ + register vm_page_t p; + + if (object == NULL) + return; + + vm_object_lock(object); + for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + if ((start <= p->offset) && (p->offset < end)) { + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_READ); + p->flags |= PG_COPYONWRITE; + } + } + vm_object_unlock(object); +} + +/* + * vm_object_pmap_remove: + * + * Removes all physical pages in the specified + * object range from all physical maps. + * + * The object must *not* be locked. + */ +void +vm_object_pmap_remove(object, start, end) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; +{ + register vm_page_t p; + + if (object == NULL) + return; + + vm_object_lock(object); +again: + for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + if ((start <= p->offset) && (p->offset < end)) { + if (p->flags & PG_BUSY) { + p->flags |= PG_WANTED; + tsleep((caddr_t) p, PVM, "vmopmr", 0); + goto again; + } + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + if ((p->flags & PG_CLEAN) == 0) + p->flags |= PG_LAUNDRY; + } + } + vm_object_unlock(object); +} + +/* + * vm_object_copy: + * + * Create a new object which is a copy of an existing + * object, and mark all of the pages in the existing + * object 'copy-on-write'. The new object has one reference. + * Returns the new object. + * + * May defer the copy until later if the object is not backed + * up by a non-default pager. + */ +void vm_object_copy(src_object, src_offset, size, + dst_object, dst_offset, src_needs_copy) + register vm_object_t src_object; + vm_offset_t src_offset; + vm_size_t size; + vm_object_t *dst_object; /* OUT */ + vm_offset_t *dst_offset; /* OUT */ + boolean_t *src_needs_copy; /* OUT */ +{ + register vm_object_t new_copy; + register vm_object_t old_copy; + vm_offset_t new_start, new_end; + + register vm_page_t p; + + if (src_object == NULL) { + /* + * Nothing to copy + */ + *dst_object = NULL; + *dst_offset = 0; + *src_needs_copy = FALSE; + return; + } + + + /* + * If the object's pager is null_pager or the + * default pager, we don't have to make a copy + * of it. Instead, we set the needs copy flag and + * make a shadow later. + */ + + vm_object_lock(src_object); + + /* + * Try to collapse the object before copying it. + */ + + vm_object_collapse(src_object); + + if (src_object->pager == NULL || + src_object->pager->pg_type == PG_SWAP || + (src_object->flags & OBJ_INTERNAL)) { + + /* + * Make another reference to the object + */ + src_object->ref_count++; + + /* + * Mark all of the pages copy-on-write. + */ + for (p = src_object->memq.tqh_first; p; p = p->listq.tqe_next) + if (src_offset <= p->offset && + p->offset < src_offset + size) + p->flags |= PG_COPYONWRITE; + vm_object_unlock(src_object); + + *dst_object = src_object; + *dst_offset = src_offset; + + /* + * Must make a shadow when write is desired + */ + *src_needs_copy = TRUE; + return; + } + + + /* + * If the object has a pager, the pager wants to + * see all of the changes. We need a copy-object + * for the changed pages. + * + * If there is a copy-object, and it is empty, + * no changes have been made to the object since the + * copy-object was made. We can use the same copy- + * object. 
+ */ + + Retry1: + old_copy = src_object->copy; + if (old_copy != NULL) { + /* + * Try to get the locks (out of order) + */ + if (!vm_object_lock_try(old_copy)) { + vm_object_unlock(src_object); + + /* should spin a bit here... */ + vm_object_lock(src_object); + goto Retry1; + } + + if (old_copy->resident_page_count == 0 && + old_copy->pager == NULL) { + /* + * Return another reference to + * the existing copy-object. + */ + old_copy->ref_count++; + vm_object_unlock(old_copy); + vm_object_unlock(src_object); + *dst_object = old_copy; + *dst_offset = src_offset; + *src_needs_copy = FALSE; + return; + } + vm_object_unlock(old_copy); + } + vm_object_unlock(src_object); + + /* + * If the object has a pager, the pager wants + * to see all of the changes. We must make + * a copy-object and put the changed pages there. + * + * The copy-object is always made large enough to + * completely shadow the original object, since + * it may have several users who want to shadow + * the original object at different points. + */ + + new_copy = vm_object_allocate(src_object->size); + + Retry2: + vm_object_lock(src_object); + /* + * Copy object may have changed while we were unlocked + */ + old_copy = src_object->copy; + if (old_copy != NULL) { + /* + * Try to get the locks (out of order) + */ + if (!vm_object_lock_try(old_copy)) { + vm_object_unlock(src_object); + goto Retry2; + } + + /* + * Consistency check + */ + if (old_copy->shadow != src_object || + old_copy->shadow_offset != (vm_offset_t) 0) + panic("vm_object_copy: copy/shadow inconsistency"); + + /* + * Make the old copy-object shadow the new one. + * It will receive no more pages from the original + * object. + */ + + src_object->ref_count--; /* remove ref. from old_copy */ + old_copy->shadow = new_copy; + new_copy->ref_count++; /* locking not needed - we + have the only pointer */ + vm_object_unlock(old_copy); /* done with old_copy */ + } + + new_start = (vm_offset_t) 0; /* always shadow original at 0 */ + new_end = (vm_offset_t) new_copy->size; /* for the whole object */ + + /* + * Point the new copy at the existing object. + */ + + new_copy->shadow = src_object; + new_copy->shadow_offset = new_start; + src_object->ref_count++; + src_object->copy = new_copy; + + /* + * Mark all the affected pages of the existing object + * copy-on-write. + */ + for (p = src_object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) + if ((new_start <= p->offset) && (p->offset < new_end)) + p->flags |= PG_COPYONWRITE; + + vm_object_unlock(src_object); + + *dst_object = new_copy; + *dst_offset = src_offset - new_start; + *src_needs_copy = FALSE; +} + +/* + * vm_object_shadow: + * + * Create a new object which is backed by the + * specified existing object range. The source + * object reference is deallocated. + * + * The new object and offset into that object + * are returned in the source parameters. + */ + +void +vm_object_shadow(object, offset, length) + vm_object_t *object; /* IN/OUT */ + vm_offset_t *offset; /* IN/OUT */ + vm_size_t length; +{ + register vm_object_t source; + register vm_object_t result; + + source = *object; + + /* + * Allocate a new object with the given length + */ + + if ((result = vm_object_allocate(length)) == NULL) + panic("vm_object_shadow: no object for shadowing"); + + /* + * The new object shadows the source object, adding + * a reference to it. Our caller changes his reference + * to point to the new object, removing a reference to + * the source object. Net result: no change of reference + * count. 
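+	 *
+	 * vm_map_lookup() (vm_map.c), for example, shadows a
+	 * needs_copy entry this way before allowing a write:
+	 *
+	 *	vm_object_shadow(
+	 *	    &entry->object.vm_object,
+	 *	    &entry->offset,
+	 *	    (vm_size_t) (entry->end - entry->start));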
+ */ + result->shadow = source; + + /* + * Store the offset into the source object, + * and fix up the offset into the new object. + */ + + result->shadow_offset = *offset; + + /* + * Return the new things + */ + + *offset = 0; + *object = result; +} + +/* + * Set the specified object's pager to the specified pager. + */ + +void +vm_object_setpager(object, pager, paging_offset, + read_only) + vm_object_t object; + vm_pager_t pager; + vm_offset_t paging_offset; + boolean_t read_only; +{ +#ifdef lint + read_only++; /* No longer used */ +#endif lint + + vm_object_lock(object); /* XXX ? */ + if (object->pager && object->pager != pager) { + panic("!!!pager already allocated!!!\n"); + } + object->pager = pager; + object->paging_offset = paging_offset; + vm_object_unlock(object); /* XXX ? */ +} + +/* + * vm_object_hash hashes the pager/id pair. + */ + +#define vm_object_hash(pager) \ + (((unsigned)pager >> 5)%VM_OBJECT_HASH_COUNT) + +/* + * vm_object_lookup looks in the object cache for an object with the + * specified pager and paging id. + */ + +vm_object_t vm_object_lookup(pager) + vm_pager_t pager; +{ + register vm_object_hash_entry_t entry; + vm_object_t object; + + vm_object_cache_lock(); + + for (entry = vm_object_hashtable[vm_object_hash(pager)].tqh_first; + entry != NULL; + entry = entry->hash_links.tqe_next) { + object = entry->object; + if (object->pager == pager) { + vm_object_lock(object); + if (object->ref_count == 0) { + TAILQ_REMOVE(&vm_object_cached_list, object, + cached_list); + vm_object_cached--; + } + object->ref_count++; + vm_object_unlock(object); + vm_object_cache_unlock(); + return(object); + } + } + + vm_object_cache_unlock(); + return(NULL); +} + +/* + * vm_object_enter enters the specified object/pager/id into + * the hash table. + */ + +void vm_object_enter(object, pager) + vm_object_t object; + vm_pager_t pager; +{ + struct vm_object_hash_head *bucket; + register vm_object_hash_entry_t entry; + + /* + * We don't cache null objects, and we can't cache + * objects with the null pager. + */ + + if (object == NULL) + return; + if (pager == NULL) + return; + + bucket = &vm_object_hashtable[vm_object_hash(pager)]; + entry = (vm_object_hash_entry_t) + malloc((u_long)sizeof *entry, M_VMOBJHASH, M_WAITOK); + entry->object = object; + object->flags |= OBJ_CANPERSIST; + + vm_object_cache_lock(); + TAILQ_INSERT_TAIL(bucket, entry, hash_links); + vm_object_cache_unlock(); +} + +/* + * vm_object_remove: + * + * Remove the pager from the hash table. + * Note: This assumes that the object cache + * is locked. XXX this should be fixed + * by reorganizing vm_object_deallocate. + */ +void +vm_object_remove(pager) + register vm_pager_t pager; +{ + struct vm_object_hash_head *bucket; + register vm_object_hash_entry_t entry; + register vm_object_t object; + + bucket = &vm_object_hashtable[vm_object_hash(pager)]; + + for (entry = bucket->tqh_first; + entry != NULL; + entry = entry->hash_links.tqe_next) { + object = entry->object; + if (object->pager == pager) { + TAILQ_REMOVE(bucket, entry, hash_links); + free((caddr_t)entry, M_VMOBJHASH); + break; + } + } +} + +boolean_t vm_object_collapse_allowed = TRUE; +/* + * vm_object_collapse: + * + * Collapse an object with the object backing it. + * Pages in the backing object are moved into the + * parent, and the backing object is deallocated. + * + * Requires that the object be locked and the page + * queues be unlocked. + * + * This routine has significant changes by John S. Dyson + * to fix some swap memory leaks. 
18 Dec 93 + * + */ +void +vm_object_collapse(object) + register vm_object_t object; + +{ + register vm_object_t backing_object; + register vm_offset_t backing_offset; + register vm_size_t size; + register vm_offset_t new_offset; + register vm_page_t p, pp; + + if (!vm_object_collapse_allowed) + return; + + while (TRUE) { + /* + * Verify that the conditions are right for collapse: + * + * The object exists and no pages in it are currently + * being paged out. + */ + if (object == NULL || + object->paging_in_progress != 0) + return; + + /* + * There is a backing object, and + */ + + if ((backing_object = object->shadow) == NULL) + return; + + vm_object_lock(backing_object); + /* + * ... + * The backing object is not read_only, + * and no pages in the backing object are + * currently being paged out. + * The backing object is internal. + */ + + if ((backing_object->flags & OBJ_INTERNAL) == 0 || + backing_object->paging_in_progress != 0) { + vm_object_unlock(backing_object); + return; + } + + /* + * The backing object can't be a copy-object: + * the shadow_offset for the copy-object must stay + * as 0. Furthermore (for the 'we have all the + * pages' case), if we bypass backing_object and + * just shadow the next object in the chain, old + * pages from that object would then have to be copied + * BOTH into the (former) backing_object and into the + * parent object. + */ + if (backing_object->shadow != NULL && + backing_object->shadow->copy == backing_object) { + vm_object_unlock(backing_object); + return; + } + + /* + * we can deal only with the swap pager + */ + if ((object->pager && + object->pager->pg_type != PG_SWAP) || + (backing_object->pager && + backing_object->pager->pg_type != PG_SWAP)) { + vm_object_unlock(backing_object); + return; + } + + + /* + * We know that we can either collapse the backing + * object (if the parent is the only reference to + * it) or (perhaps) remove the parent's reference + * to it. + */ + + backing_offset = object->shadow_offset; + size = object->size; + + /* + * If there is exactly one reference to the backing + * object, we can collapse it into the parent. + */ + + if (backing_object->ref_count == 1) { + + /* + * We can collapse the backing object. + * + * Move all in-memory pages from backing_object + * to the parent. Pages that have been paged out + * will be overwritten by any of the parent's + * pages that shadow them. + */ + + while (p = backing_object->memq.tqh_first) { + + new_offset = (p->offset - backing_offset); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * dispose of it. + * + * Otherwise, move it as planned. + */ + + if (p->offset < backing_offset || + new_offset >= size) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } else { + pp = vm_page_lookup(object, new_offset); + if (pp != NULL || (object->pager && vm_pager_has_page(object->pager, + object->paging_offset + new_offset))) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } else { + vm_page_rename(p, object, new_offset); + } + } + } + + /* + * Move the pager from backing_object to object. + */ + + if (backing_object->pager) { + backing_object->paging_in_progress++; + if (object->pager) { + vm_pager_t bopager; + object->paging_in_progress++; + /* + * copy shadow object pages into ours + * and destroy unneeded pages in shadow object. 
+					 */
+					bopager = backing_object->pager;
+					backing_object->pager = NULL;
+					vm_object_remove(bopager);
+					swap_pager_copy(
+						bopager, backing_object->paging_offset,
+						object->pager, object->paging_offset,
+						object->shadow_offset);
+					object->paging_in_progress--;
+					if (object->paging_in_progress == 0)
+						wakeup((caddr_t)object);
+				} else {
+					object->paging_in_progress++;
+					/*
+					 * grab the shadow object's pager
+					 */
+					object->pager = backing_object->pager;
+					object->paging_offset = backing_object->paging_offset + backing_offset;
+					vm_object_remove(backing_object->pager);
+					backing_object->pager = NULL;
+					/*
+					 * free unnecessary blocks
+					 */
+					swap_pager_freespace(object->pager, 0, object->paging_offset);
+					object->paging_in_progress--;
+					if (object->paging_in_progress == 0)
+						wakeup((caddr_t)object);
+				}
+				backing_object->paging_in_progress--;
+				if (backing_object->paging_in_progress == 0)
+					wakeup((caddr_t)backing_object);
+			}
+
+
+			/*
+			 * Object now shadows whatever backing_object did.
+			 * Note that the reference to backing_object->shadow
+			 * moves from within backing_object to within object.
+			 */
+
+			object->shadow = backing_object->shadow;
+			object->shadow_offset += backing_object->shadow_offset;
+			if (object->shadow != NULL &&
+			    object->shadow->copy != NULL) {
+				panic("vm_object_collapse: we collapsed a copy-object!");
+			}
+			/*
+			 * Discard backing_object.
+			 *
+			 * Since the backing object has no pages, no
+			 * pager left, and no object references within it,
+			 * all that is necessary is to dispose of it.
+			 */
+
+			vm_object_unlock(backing_object);
+
+			simple_lock(&vm_object_list_lock);
+			TAILQ_REMOVE(&vm_object_list, backing_object,
+			    object_list);
+			vm_object_count--;
+			simple_unlock(&vm_object_list_lock);
+
+			free((caddr_t)backing_object, M_VMOBJ);
+
+			object_collapses++;
+		}
+		else {
+			/*
+			 * If all of the pages in the backing object are
+			 * shadowed by the parent object, the parent
+			 * object no longer has to shadow the backing
+			 * object; it can shadow the next one in the
+			 * chain.
+			 *
+			 * The backing object must not be paged out - we'd
+			 * have to check all of the paged-out pages, as
+			 * well.
+			 */
+
+			if (backing_object->pager != NULL) {
+				vm_object_unlock(backing_object);
+				return;
+			}
+
+			/*
+			 * Should have a check for a 'small' number
+			 * of pages here.
+			 */
+
+			for( p = backing_object->memq.tqh_first;p;p=p->listq.tqe_next) {
+				new_offset = (p->offset - backing_offset);
+
+				/*
+				 * If the parent has a page here, or if
+				 * this page falls outside the parent,
+				 * keep going.
+				 *
+				 * Otherwise, the backing_object must be
+				 * left in the chain.
+				 */
+
+				if (p->offset >= backing_offset &&
+				    new_offset <= size &&
+				    ((pp = vm_page_lookup(object, new_offset)) == NULL || (pp->flags & PG_FAKE)) &&
+				    (!object->pager || !vm_pager_has_page(object->pager, object->paging_offset+new_offset))) {
+					/*
+					 * Page still needed.
+					 * Can't go any further.
+					 */
+					vm_object_unlock(backing_object);
+					return;
+				}
+			}
+
+			/*
+			 * Make the parent shadow the next object
+			 * in the chain. Deallocating backing_object
+			 * will not remove it, since its reference
+			 * count is at least 2.
+			 */
+
+			vm_object_reference(object->shadow = backing_object->shadow);
+			object->shadow_offset += backing_object->shadow_offset;
+
+			/*
+			 * Backing object might have had a copy pointer
+			 * to us. If it did, clear it.
+			 */
+			if (backing_object->copy == object) {
+				backing_object->copy = NULL;
+			}
+
+			/* Drop the reference count on backing_object.
+ * Since its ref_count was at least 2, it + * will not vanish; so we don't need to call + * vm_object_deallocate. + */ + if (backing_object->ref_count == 1) + printf("should have called obj deallocate\n"); + backing_object->ref_count--; + vm_object_unlock(backing_object); + + object_bypasses ++; + + } + + /* + * Try again with this object's new backing object. + */ + } +} + +/* + * vm_object_page_remove: [internal] + * + * Removes all physical pages in the specified + * object range from the object's list of pages. + * + * The object must be locked. + */ +void +vm_object_page_remove(object, start, end) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; +{ + register vm_page_t p, next; + vm_offset_t size; + int cnt; + int s; + + if (object == NULL) + return; + + start = trunc_page(start); + end = round_page(end); +again: + size = end-start; + if (size > 4*PAGE_SIZE || size >= object->size/4) { + for (p = object->memq.tqh_first; (p != NULL && size > 0); p = next) { + next = p->listq.tqe_next; + if ((start <= p->offset) && (p->offset < end)) { + if (p->flags & PG_BUSY) { + p->flags |= PG_WANTED; + tsleep((caddr_t) p, PVM, "vmopar", 0); + goto again; + } + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + size -= PAGE_SIZE; + } + } + } else { + while (size > 0) { + while (p = vm_page_lookup(object, start)) { + if (p->flags & PG_BUSY) { + p->flags |= PG_WANTED; + tsleep((caddr_t) p, PVM, "vmopar", 0); + goto again; + } + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } + start += PAGE_SIZE; + size -= PAGE_SIZE; + } + } +} + +/* + * Routine: vm_object_coalesce + * Function: Coalesces two objects backing up adjoining + * regions of memory into a single object. + * + * returns TRUE if objects were combined. + * + * NOTE: Only works at the moment if the second object is NULL - + * if it's not, which object do we lock first? + * + * Parameters: + * prev_object First object to coalesce + * prev_offset Offset into prev_object + * next_object Second object into coalesce + * next_offset Offset into next_object + * + * prev_size Size of reference to prev_object + * next_size Size of reference to next_object + * + * Conditions: + * The object must *not* be locked. + */ +boolean_t vm_object_coalesce(prev_object, next_object, + prev_offset, next_offset, + prev_size, next_size) + + register vm_object_t prev_object; + vm_object_t next_object; + vm_offset_t prev_offset, next_offset; + vm_size_t prev_size, next_size; +{ + vm_size_t newsize; + +#ifdef lint + next_offset++; +#endif + + if (next_object != NULL) { + return(FALSE); + } + + if (prev_object == NULL) { + return(TRUE); + } + + vm_object_lock(prev_object); + + /* + * Try to collapse the object first + */ + vm_object_collapse(prev_object); + + /* + * Can't coalesce if: + * . more than one reference + * . paged out + * . shadows another object + * . has a copy elsewhere + * (any of which mean that the pages not mapped to + * prev_entry may be in use anyway) + */ + + if (prev_object->ref_count > 1 || + prev_object->pager != NULL || + prev_object->shadow != NULL || + prev_object->copy != NULL) { + vm_object_unlock(prev_object); + return(FALSE); + } + + /* + * Remove any pages that may still be in the object from + * a previous deallocation. 
+ */ + + vm_object_page_remove(prev_object, + prev_offset + prev_size, + prev_offset + prev_size + next_size); + + /* + * Extend the object if necessary. + */ + newsize = prev_offset + prev_size + next_size; + if (newsize > prev_object->size) + prev_object->size = newsize; + + vm_object_unlock(prev_object); + return(TRUE); +} + +/* + * returns page after looking up in shadow chain + */ + +vm_page_t +vm_object_page_lookup(object, offset) + vm_object_t object; + vm_offset_t offset; +{ + vm_page_t m; + if (!(m=vm_page_lookup(object, offset))) { + if (!object->shadow) + return 0; + else + return vm_object_page_lookup(object->shadow, offset + object->shadow_offset); + } + return m; +} + +#define DEBUG +#if defined(DEBUG) || (NDDB > 0) +/* + * vm_object_print: [ debug ] + */ +void vm_object_print(object, full) + vm_object_t object; + boolean_t full; +{ + register vm_page_t p; + extern indent; + + register int count; + + if (object == NULL) + return; + + iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ", + (int) object, (int) object->size, + object->resident_page_count, object->ref_count); + printf("pager=0x%x+0x%x, shadow=(0x%x)+0x%x\n", + (int) object->pager, (int) object->paging_offset, + (int) object->shadow, (int) object->shadow_offset); + printf("cache: next=0x%x, prev=0x%x\n", + object->cached_list.tqe_next, object->cached_list.tqe_prev); + + if (!full) + return; + + indent += 2; + count = 0; + for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + if (count == 0) + iprintf("memory:="); + else if (count == 6) { + printf("\n"); + iprintf(" ..."); + count = 0; + } else + printf(","); + count++; + + printf("(off=0x%x,page=0x%x)", p->offset, VM_PAGE_TO_PHYS(p)); + } + if (count != 0) + printf("\n"); + indent -= 2; +} +#endif /* defined(DEBUG) || (NDDB > 0) */ diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h new file mode 100644 index 0000000..5e220ac --- /dev/null +++ b/sys/vm/vm_object.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_object.h 8.3 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory object module definitions. + */ + +#ifndef _VM_OBJECT_ +#define _VM_OBJECT_ + +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +/* + * Types defined: + * + * vm_object_t Virtual memory object. + */ + +struct vm_object { + struct pglist memq; /* Resident memory */ + TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ + u_short flags; /* see below */ + u_short paging_in_progress; /* Paging (in or out) so + don't collapse or destroy */ + simple_lock_data_t Lock; /* Synchronization */ + int ref_count; /* How many refs?? 
*/ + vm_size_t size; /* Object size */ + int resident_page_count; + /* number of resident pages */ + struct vm_object *copy; /* Object that holds copies of + my changed pages */ + vm_pager_t pager; /* Where to get data */ + vm_offset_t paging_offset; /* Offset into paging space */ + struct vm_object *shadow; /* My shadow */ + vm_offset_t shadow_offset; /* Offset in shadow */ + TAILQ_ENTRY(vm_object) cached_list; /* for persistence */ +}; +/* + * Flags + */ +#define OBJ_CANPERSIST 0x0001 /* allow to persist */ +#define OBJ_INTERNAL 0x0002 /* internally created object */ +#define OBJ_ACTIVE 0x0004 /* used to mark active objects */ + +TAILQ_HEAD(vm_object_hash_head, vm_object_hash_entry); + +struct vm_object_hash_entry { + TAILQ_ENTRY(vm_object_hash_entry) hash_links; /* hash chain links */ + vm_object_t object; /* object represened */ +}; + +typedef struct vm_object_hash_entry *vm_object_hash_entry_t; + +#ifdef KERNEL +TAILQ_HEAD(object_q, vm_object); + +struct object_q vm_object_cached_list; /* list of objects persisting */ +int vm_object_cached; /* size of cached list */ +simple_lock_data_t vm_cache_lock; /* lock for object cache */ + +struct object_q vm_object_list; /* list of allocated objects */ +long vm_object_count; /* count of all objects */ +simple_lock_data_t vm_object_list_lock; + /* lock for object list and count */ + +vm_object_t kernel_object; /* the single kernel object */ +vm_object_t kmem_object; + +#define vm_object_cache_lock() simple_lock(&vm_cache_lock) +#define vm_object_cache_unlock() simple_unlock(&vm_cache_lock) +#endif /* KERNEL */ + +#define vm_object_lock_init(object) simple_lock_init(&(object)->Lock) +#define vm_object_lock(object) simple_lock(&(object)->Lock) +#define vm_object_unlock(object) simple_unlock(&(object)->Lock) +#define vm_object_lock_try(object) simple_lock_try(&(object)->Lock) +#define vm_object_sleep(event, object, interruptible) \ + thread_sleep((event), &(object)->Lock, (interruptible)) + +#ifdef KERNEL +vm_object_t vm_object_allocate __P((vm_size_t)); +void vm_object_cache_clear __P((void)); +void vm_object_cache_trim __P((void)); +boolean_t vm_object_coalesce __P((vm_object_t, vm_object_t, + vm_offset_t, vm_offset_t, vm_offset_t, vm_size_t)); +void vm_object_collapse __P((vm_object_t)); +void vm_object_copy __P((vm_object_t, vm_offset_t, vm_size_t, + vm_object_t *, vm_offset_t *, boolean_t *)); +void vm_object_deactivate_pages __P((vm_object_t)); +void vm_object_deallocate __P((vm_object_t)); +void vm_object_enter __P((vm_object_t, vm_pager_t)); +void vm_object_init __P((vm_size_t)); +vm_object_t vm_object_lookup __P((vm_pager_t)); +boolean_t vm_object_page_clean __P((vm_object_t, + vm_offset_t, vm_offset_t, boolean_t, boolean_t)); +void vm_object_page_remove __P((vm_object_t, + vm_offset_t, vm_offset_t)); +void vm_object_pmap_copy __P((vm_object_t, + vm_offset_t, vm_offset_t)); +void vm_object_pmap_remove __P((vm_object_t, + vm_offset_t, vm_offset_t)); +void vm_object_print __P((vm_object_t, boolean_t)); +void vm_object_reference __P((vm_object_t)); +void vm_object_remove __P((vm_pager_t)); +void vm_object_setpager __P((vm_object_t, + vm_pager_t, vm_offset_t, boolean_t)); +void vm_object_shadow __P((vm_object_t *, + vm_offset_t *, vm_size_t)); +void vm_object_terminate __P((vm_object_t)); +#endif +#endif /* _VM_OBJECT_ */ diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c new file mode 100644 index 0000000..4304100 --- /dev/null +++ b/sys/vm/vm_page.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 1991 Regents of the University of California. 
+ * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 + * $Id: vm_page.c,v 1.2 1994/05/25 09:20:05 rgrimes Exp $ + */ + +/* + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Resident memory management module. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_pageout.h> + +/* + * Associated with page of user-allocatable memory is a + * page structure. + */ + +struct pglist *vm_page_buckets; /* Array of buckets */ +int vm_page_bucket_count = 0; /* How big is array? 
*/ +int vm_page_hash_mask; /* Mask for hash function */ +simple_lock_data_t bucket_lock; /* lock for all buckets XXX */ + +struct pglist vm_page_queue_free; +struct pglist vm_page_queue_active; +struct pglist vm_page_queue_inactive; +simple_lock_data_t vm_page_queue_lock; +simple_lock_data_t vm_page_queue_free_lock; + +/* has physical page allocation been initialized? */ +boolean_t vm_page_startup_initialized; + +vm_page_t vm_page_array; +long first_page; +long last_page; +vm_offset_t first_phys_addr; +vm_offset_t last_phys_addr; +vm_size_t page_mask; +int page_shift; + +/* + * vm_set_page_size: + * + * Sets the page size, perhaps based upon the memory + * size. Must be called before any use of page-size + * dependent functions. + * + * Sets page_shift and page_mask from cnt.v_page_size. + */ +void vm_set_page_size() +{ + + if (cnt.v_page_size == 0) + cnt.v_page_size = DEFAULT_PAGE_SIZE; + page_mask = cnt.v_page_size - 1; + if ((page_mask & cnt.v_page_size) != 0) + panic("vm_set_page_size: page size not a power of two"); + for (page_shift = 0; ; page_shift++) + if ((1 << page_shift) == cnt.v_page_size) + break; +} + +/* + * vm_page_startup: + * + * Initializes the resident memory module. + * + * Allocates memory for the page cells, and + * for the object/offset-to-page hash table headers. + * Each page cell is initialized and placed on the free list. + */ + +vm_offset_t +vm_page_startup(starta, enda, vaddr) + register vm_offset_t starta; + vm_offset_t enda; + register vm_offset_t vaddr; +{ + register vm_offset_t mapped; + register vm_page_t m; + register struct pglist *bucket; + vm_size_t npages, page_range; + register vm_offset_t new_start; + int i; + vm_offset_t pa; + int nblocks; + vm_offset_t first_managed_page; + int size; + + extern vm_offset_t kentry_data; + extern vm_size_t kentry_data_size; + extern vm_offset_t phys_avail[]; +/* the biggest memory array is the second group of pages */ + vm_offset_t start; + vm_offset_t biggestone, biggestsize; + + vm_offset_t total; + + total = 0; + biggestsize = 0; + biggestone = 0; + nblocks = 0; + vaddr = round_page(vaddr); + + for (i = 0; phys_avail[i + 1]; i += 2) { + phys_avail[i] = round_page(phys_avail[i]); + phys_avail[i+1] = trunc_page(phys_avail[i+1]); + } + + for (i = 0; phys_avail[i + 1]; i += 2) { + int size = phys_avail[i+1] - phys_avail[i]; + if (size > biggestsize) { + biggestone = i; + biggestsize = size; + } + ++nblocks; + total += size; + } + + start = phys_avail[biggestone]; + + + /* + * Initialize the locks + */ + + simple_lock_init(&vm_page_queue_free_lock); + simple_lock_init(&vm_page_queue_lock); + + /* + * Initialize the queue headers for the free queue, + * the active queue and the inactive queue. + */ + + TAILQ_INIT(&vm_page_queue_free); + TAILQ_INIT(&vm_page_queue_active); + TAILQ_INIT(&vm_page_queue_inactive); + + /* + * Allocate (and initialize) the hash table buckets. + * + * The number of buckets MUST BE a power of 2, and + * the actual value is the next power of 2 greater + * than the number of physical pages in the system. + * + * Note: + * This computation can be tweaked if desired. + */ + vm_page_buckets = (struct pglist *)vaddr; + bucket = vm_page_buckets; + if (vm_page_bucket_count == 0) { + vm_page_bucket_count = 1; + while (vm_page_bucket_count < atop(total)) + vm_page_bucket_count <<= 1; + } + + + vm_page_hash_mask = vm_page_bucket_count - 1; + + /* + * Validate these addresses. 
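+	 * (the physical memory backing the hash bucket array is mapped
+	 * into the kernel address space and zeroed before it is used)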
+ */ + + new_start = start + vm_page_bucket_count * sizeof(struct pglist); + new_start = round_page(new_start); + mapped = vaddr; + vaddr = pmap_map(mapped, start, new_start, + VM_PROT_READ|VM_PROT_WRITE); + start = new_start; + bzero((caddr_t) mapped, vaddr - mapped); + mapped = vaddr; + + for (i = 0; i< vm_page_bucket_count; i++) { + TAILQ_INIT(bucket); + bucket++; + } + + simple_lock_init(&bucket_lock); + + /* + * round (or truncate) the addresses to our page size. + */ + + /* + * Pre-allocate maps and map entries that cannot be dynamically + * allocated via malloc(). The maps include the kernel_map and + * kmem_map which must be initialized before malloc() will + * work (obviously). Also could include pager maps which would + * be allocated before kmeminit. + * + * Allow some kernel map entries... this should be plenty + * since people shouldn't be cluttering up the kernel + * map (they should use their own maps). + */ + + kentry_data_size = MAX_KMAP * sizeof(struct vm_map) + + MAX_KMAPENT * sizeof(struct vm_map_entry); + kentry_data_size = round_page(kentry_data_size); + kentry_data = (vm_offset_t) vaddr; + vaddr += kentry_data_size; + + /* + * Validate these zone addresses. + */ + + new_start = start + (vaddr - mapped); + pmap_map(mapped, start, new_start, VM_PROT_READ|VM_PROT_WRITE); + bzero((caddr_t) mapped, (vaddr - mapped)); + start = round_page(new_start); + + /* + * Compute the number of pages of memory that will be + * available for use (taking into account the overhead + * of a page structure per page). + */ + + npages = (total - (start - phys_avail[biggestone])) / (PAGE_SIZE + sizeof(struct vm_page)); + first_page = phys_avail[0] / PAGE_SIZE; + + page_range = (phys_avail[(nblocks-1)*2 + 1] - phys_avail[0]) / PAGE_SIZE; + /* + * Initialize the mem entry structures now, and + * put them in the free queue. + */ + + vm_page_array = (vm_page_t) vaddr; + mapped = vaddr; + + + /* + * Validate these addresses. + */ + + new_start = round_page(start + page_range * sizeof (struct vm_page)); + mapped = pmap_map(mapped, start, new_start, + VM_PROT_READ|VM_PROT_WRITE); + start = new_start; + + first_managed_page = start / PAGE_SIZE; + + /* + * Clear all of the page structures + */ + bzero((caddr_t)vm_page_array, page_range * sizeof(struct vm_page)); + + cnt.v_page_count = 0; + cnt.v_free_count= 0; + for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { + if (i == biggestone) + pa = ptoa(first_managed_page); + else + pa = phys_avail[i]; + while (pa < phys_avail[i + 1] && npages-- > 0) { + ++cnt.v_page_count; + ++cnt.v_free_count; + m = PHYS_TO_VM_PAGE(pa); + m->flags = 0; + m->object = 0; + m->phys_addr = pa; + m->hold_count = 0; + TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); + pa += PAGE_SIZE; + } + } + + /* + * Initialize vm_pages_needed lock here - don't wait for pageout + * daemon XXX + */ + simple_lock_init(&vm_pages_needed_lock); + + return(mapped); +} + +/* + * vm_page_hash: + * + * Distributes the object/offset key pair among hash buckets. + * + * NOTE: This macro depends on vm_page_bucket_count being a power of 2. + */ +inline const int +vm_page_hash(object, offset) + vm_object_t object; + vm_offset_t offset; +{ + return ((unsigned)object + offset/NBPG) & vm_page_hash_mask; +} + +/* + * vm_page_insert: [ internal use only ] + * + * Inserts the given mem entry into the object/object-page + * table and object list. + * + * The object and page must be locked. 
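+ *
+ * The page is entered into the object/offset hash table and onto the
+ * object's list of resident pages, and the object's resident page
+ * count is incremented.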
+ */ + +void vm_page_insert(mem, object, offset) + register vm_page_t mem; + register vm_object_t object; + register vm_offset_t offset; +{ + register struct pglist *bucket; + int s; + + VM_PAGE_CHECK(mem); + + if (mem->flags & PG_TABLED) + panic("vm_page_insert: already inserted"); + + /* + * Record the object/offset pair in this page + */ + + mem->object = object; + mem->offset = offset; + + /* + * Insert it into the object_object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + s = splimp(); + simple_lock(&bucket_lock); + TAILQ_INSERT_TAIL(bucket, mem, hashq); + simple_unlock(&bucket_lock); + (void) splx(s); + + /* + * Now link into the object's list of backed pages. + */ + + TAILQ_INSERT_TAIL(&object->memq, mem, listq); + mem->flags |= PG_TABLED; + + /* + * And show that the object has one more resident + * page. + */ + + object->resident_page_count++; +} + +/* + * vm_page_remove: [ internal use only ] + * NOTE: used by device pager as well -wfj + * + * Removes the given mem entry from the object/offset-page + * table and the object page list. + * + * The object and page must be locked. + */ + +void vm_page_remove(mem) + register vm_page_t mem; +{ + register struct pglist *bucket; + int s; + + VM_PAGE_CHECK(mem); + + if (!(mem->flags & PG_TABLED)) + return; + + /* + * Remove from the object_object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)]; + s = splimp(); + simple_lock(&bucket_lock); + TAILQ_REMOVE(bucket, mem, hashq); + simple_unlock(&bucket_lock); + (void) splx(s); + + /* + * Now remove from the object's list of backed pages. + */ + + TAILQ_REMOVE(&mem->object->memq, mem, listq); + + /* + * And show that the object has one fewer resident + * page. + */ + + mem->object->resident_page_count--; + + mem->flags &= ~PG_TABLED; +} + +/* + * vm_page_lookup: + * + * Returns the page associated with the object/offset + * pair specified; if none is found, NULL is returned. + * + * The object must be locked. No side effects. + */ + +vm_page_t vm_page_lookup(object, offset) + register vm_object_t object; + register vm_offset_t offset; +{ + register vm_page_t mem; + register struct pglist *bucket; + int s; + + /* + * Search the hash table for this object/offset pair + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + + s = splimp(); + simple_lock(&bucket_lock); + for (mem = bucket->tqh_first; mem != NULL; mem = mem->hashq.tqe_next) { + VM_PAGE_CHECK(mem); + if ((mem->object == object) && (mem->offset == offset)) { + simple_unlock(&bucket_lock); + splx(s); + return(mem); + } + } + + simple_unlock(&bucket_lock); + splx(s); + return(NULL); +} + +/* + * vm_page_rename: + * + * Move the given memory entry from its + * current object to the specified target object/offset. + * + * The object must be locked. + */ +void vm_page_rename(mem, new_object, new_offset) + register vm_page_t mem; + register vm_object_t new_object; + vm_offset_t new_offset; +{ + if (mem->object == new_object) + return; + + vm_page_lock_queues(); /* keep page from moving out from + under pageout daemon */ + vm_page_remove(mem); + vm_page_insert(mem, new_object, new_offset); + vm_page_unlock_queues(); +} + +/* + * vm_page_alloc: + * + * Allocate and return a memory cell associated + * with this VM object/offset pair. + * + * Object must be locked. 
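+ *
+ * Returns NULL if the free list is empty, or if an ordinary process
+ * would drop the free page count below the pageout reserve.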
+ */ +vm_page_t +vm_page_alloc(object, offset) + vm_object_t object; + vm_offset_t offset; +{ + register vm_page_t mem; + int s; + + s = splimp(); + simple_lock(&vm_page_queue_free_lock); + if ( object != kernel_object && + object != kmem_object && + curproc != pageproc && curproc != &proc0 && + cnt.v_free_count < cnt.v_free_reserved) { + + simple_unlock(&vm_page_queue_free_lock); + splx(s); + /* + * this wakeup seems unnecessary, but there is code that + * might just check to see if there are free pages, and + * punt if there aren't. VM_WAIT does this too, but + * redundant wakeups aren't that bad... + */ + if (curproc != pageproc) + wakeup((caddr_t) &vm_pages_needed); + return(NULL); + } + if (( mem = vm_page_queue_free.tqh_first) == 0) { + simple_unlock(&vm_page_queue_free_lock); + printf("No pages???\n"); + splx(s); + /* + * comment above re: wakeups applies here too... + */ + if (curproc != pageproc) + wakeup((caddr_t) &vm_pages_needed); + return(NULL); + } + + TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); + + cnt.v_free_count--; + simple_unlock(&vm_page_queue_free_lock); + + VM_PAGE_INIT(mem, object, offset); + splx(s); + +/* + * don't wakeup too often, so we wakeup the pageout daemon when + * we would be nearly out of memory. + */ + if (curproc != pageproc && + (cnt.v_free_count < cnt.v_free_reserved)) + wakeup((caddr_t) &vm_pages_needed); + + return(mem); +} + +/* + * vm_page_free: + * + * Returns the given page to the free list, + * disassociating it with any VM object. + * + * Object and page must be locked prior to entry. + */ +void vm_page_free(mem) + register vm_page_t mem; +{ + int s; + s = splimp(); + vm_page_remove(mem); + if (mem->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); + mem->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + + if (mem->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, mem, pageq); + mem->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + } + + if (!(mem->flags & PG_FICTITIOUS)) { + + simple_lock(&vm_page_queue_free_lock); + if (mem->wire_count) { + cnt.v_wire_count--; + mem->wire_count = 0; + } + TAILQ_INSERT_TAIL(&vm_page_queue_free, mem, pageq); + + cnt.v_free_count++; + simple_unlock(&vm_page_queue_free_lock); + splx(s); + /* + * if pageout daemon needs pages, then tell it that there + * are some free. + */ + if (vm_pageout_pages_needed) + wakeup((caddr_t)&vm_pageout_pages_needed); + + /* + * wakeup processes that are waiting on memory if we + * hit a high water mark. + */ + if (cnt.v_free_count == cnt.v_free_min) { + wakeup((caddr_t)&cnt.v_free_count); + } + + /* + * wakeup scheduler process if we have lots of memory. + * this process will swapin processes. + */ + if (cnt.v_free_count == cnt.v_free_target) { + wakeup((caddr_t)&proc0); + } + } else { + splx(s); + } + wakeup((caddr_t) mem); +} + + +/* + * vm_page_wire: + * + * Mark this page as wired down by yet + * another map, removing it from paging queues + * as necessary. + * + * The page queues must be locked. 
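+ *
+ * The first wiring removes the page from the active/inactive queues
+ * and counts it in cnt.v_wire_count.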
+ */ +void vm_page_wire(mem) + register vm_page_t mem; +{ + int s; + VM_PAGE_CHECK(mem); + + if (mem->wire_count == 0) { + s = splimp(); + if (mem->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); + cnt.v_active_count--; + mem->flags &= ~PG_ACTIVE; + } + if (mem->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, mem, pageq); + cnt.v_inactive_count--; + mem->flags &= ~PG_INACTIVE; + } + splx(s); + cnt.v_wire_count++; + } + mem->wire_count++; +} + +/* + * vm_page_unwire: + * + * Release one wiring of this page, potentially + * enabling it to be paged again. + * + * The page queues must be locked. + */ +void vm_page_unwire(mem) + register vm_page_t mem; +{ + int s; + VM_PAGE_CHECK(mem); + + s = splimp(); + + if( mem->wire_count) + mem->wire_count--; + if (mem->wire_count == 0) { + TAILQ_INSERT_TAIL(&vm_page_queue_active, mem, pageq); + cnt.v_active_count++; + mem->flags |= PG_ACTIVE; + cnt.v_wire_count--; + } + splx(s); +} + +#if 0 +/* + * vm_page_deactivate: + * + * Returns the given page to the inactive list, + * indicating that no physical maps have access + * to this page. [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void +vm_page_deactivate(m) + register vm_page_t m; +{ + int spl; + VM_PAGE_CHECK(m); + + /* + * Only move active pages -- ignore locked or already + * inactive ones. + * + * XXX: sometimes we get pages which aren't wired down + * or on any queue - we need to put them on the inactive + * queue also, otherwise we lose track of them. + * Paul Mackerras (paulus@cs.anu.edu.au) 9-Jan-93. + */ + + spl = splimp(); + if (!(m->flags & PG_INACTIVE) && m->wire_count == 0 && + m->hold_count == 0) { + + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + if (m->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + m->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + m->flags |= PG_INACTIVE; + cnt.v_inactive_count++; +#define NOT_DEACTIVATE_PROTECTS +#ifndef NOT_DEACTIVATE_PROTECTS + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); +#else + if ((m->flags & PG_CLEAN) && + pmap_is_modified(VM_PAGE_TO_PHYS(m))) + m->flags &= ~PG_CLEAN; +#endif + if ((m->flags & PG_CLEAN) == 0) + m->flags |= PG_LAUNDRY; + } + splx(spl); +} +#endif +#if 1 +/* + * vm_page_deactivate: + * + * Returns the given page to the inactive list, + * indicating that no physical maps have access + * to this page. [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void vm_page_deactivate(m) + register vm_page_t m; +{ + int s; + VM_PAGE_CHECK(m); + + s = splimp(); + /* + * Only move active pages -- ignore locked or already + * inactive ones. + */ + + if ((m->flags & PG_ACTIVE) && (m->hold_count == 0)) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + m->flags &= ~PG_ACTIVE; + m->flags |= PG_INACTIVE; + cnt.v_active_count--; + cnt.v_inactive_count++; +#define NOT_DEACTIVATE_PROTECTS +#ifndef NOT_DEACTIVATE_PROTECTS + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); +#else + if (pmap_is_modified(VM_PAGE_TO_PHYS(m))) + m->flags &= ~PG_CLEAN; +#endif + if (m->flags & PG_CLEAN) + m->flags &= ~PG_LAUNDRY; + else + m->flags |= PG_LAUNDRY; + } + splx(s); +} +#endif +/* + * vm_page_activate: + * + * Put the specified page on the active list (if appropriate). + * + * The page queues must be locked. 
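+ *
+ * A page that is still wired is taken off the inactive queue but is
+ * not placed on the active queue until its last wiring is released.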
+ */ + +void vm_page_activate(m) + register vm_page_t m; +{ + int s; + VM_PAGE_CHECK(m); + + s = splimp(); + if (m->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + cnt.v_inactive_count--; + m->flags &= ~PG_INACTIVE; + } + if (m->wire_count == 0) { + if (m->flags & PG_ACTIVE) + panic("vm_page_activate: already active"); + + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + m->flags |= PG_ACTIVE; + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + m->act_count = 1; + cnt.v_active_count++; + } + splx(s); +} + +/* + * vm_page_zero_fill: + * + * Zero-fill the specified page. + * Written as a standard pagein routine, to + * be used by the zero-fill object. + */ + +boolean_t +vm_page_zero_fill(m) + vm_page_t m; +{ + VM_PAGE_CHECK(m); + + pmap_zero_page(VM_PAGE_TO_PHYS(m)); + return(TRUE); +} + +/* + * vm_page_copy: + * + * Copy one page to another + */ +void +vm_page_copy(src_m, dest_m) + vm_page_t src_m; + vm_page_t dest_m; +{ + VM_PAGE_CHECK(src_m); + VM_PAGE_CHECK(dest_m); + + pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m)); +} diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h new file mode 100644 index 0000000..e8049c4 --- /dev/null +++ b/sys/vm/vm_page.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_page.h 8.2 (Berkeley) 12/13/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Resident memory system definitions. + */ + +#ifndef _VM_PAGE_ +#define _VM_PAGE_ + +/* + * Management of resident (logical) pages. + * + * A small structure is kept for each resident + * page, indexed by page number. Each structure + * is an element of several lists: + * + * A hash table bucket used to quickly + * perform object/offset lookups + * + * A list of all pages for a given object, + * so they can be quickly deactivated at + * time of deallocation. + * + * An ordered list of pages due for pageout. + * + * In addition, the structure contains the object + * and offset to which this page belongs (for pageout), + * and sundry status bits. + * + * Fields in this structure are locked either by the lock on the + * object that the page belongs to (O) or by the lock on the page + * queues (P). + */ + +TAILQ_HEAD(pglist, vm_page); + +struct vm_page { + TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO + * queue or free list (P) */ + TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ + TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ + + vm_object_t object; /* which object am I in (O,P)*/ + vm_offset_t offset; /* offset into object (O,P) */ + + u_short wire_count; /* wired down maps refs (P) */ + u_short flags; /* see below */ + short hold_count; /* page hold count */ + u_short act_count; /* page usage count */ + + vm_offset_t phys_addr; /* physical address of page */ +}; + +/* + * These are the flags defined for vm_page. + * + * Note: PG_FILLED and PG_DIRTY are added for the filesystems. 
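+ *
+ * PG_BUSY marks a page that is in transit to or from its pager;
+ * waiters set PG_WANTED and are woken by PAGE_WAKEUP when the page
+ * is unbusied.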
+ */ +#define PG_INACTIVE 0x0001 /* page is in inactive list (P) */ +#define PG_ACTIVE 0x0002 /* page is in active list (P) */ +#define PG_LAUNDRY 0x0004 /* page is being cleaned now (P)*/ +#define PG_CLEAN 0x0008 /* page has not been modified */ +#define PG_BUSY 0x0010 /* page is in transit (O) */ +#define PG_WANTED 0x0020 /* someone is waiting for page (O) */ +#define PG_TABLED 0x0040 /* page is in VP table (O) */ +#define PG_COPYONWRITE 0x0080 /* must copy page before changing (O) */ +#define PG_FICTITIOUS 0x0100 /* physical page doesn't exist (O) */ +#define PG_FAKE 0x0200 /* page is placeholder for pagein (O) */ +#define PG_FILLED 0x0400 /* client flag to set when filled */ +#define PG_DIRTY 0x0800 /* client flag to set when dirty */ +#define PG_PAGEROWNED 0x4000 /* DEBUG: async paging op in progress */ +#define PG_PTPAGE 0x8000 /* DEBUG: is a user page table page */ + +#if VM_PAGE_DEBUG +#define VM_PAGE_CHECK(mem) { \ + if ((((unsigned int) mem) < ((unsigned int) &vm_page_array[0])) || \ + (((unsigned int) mem) > \ + ((unsigned int) &vm_page_array[last_page-first_page])) || \ + ((mem->flags & (PG_ACTIVE | PG_INACTIVE)) == \ + (PG_ACTIVE | PG_INACTIVE))) \ + panic("vm_page_check: not valid!"); \ +} +#else /* VM_PAGE_DEBUG */ +#define VM_PAGE_CHECK(mem) +#endif /* VM_PAGE_DEBUG */ + +#ifdef KERNEL +/* + * Each pageable resident page falls into one of three lists: + * + * free + * Available for allocation now. + * inactive + * Not referenced in any map, but still has an + * object/offset-page mapping, and may be dirty. + * This is the list of pages that should be + * paged out next. + * active + * A list of pages which have been placed in + * at least one physical map. This list is + * ordered, in LRU-like fashion. + */ + +extern +struct pglist vm_page_queue_free; /* memory free queue */ +extern +struct pglist vm_page_queue_active; /* active memory queue */ +extern +struct pglist vm_page_queue_inactive; /* inactive memory queue */ + +extern +vm_page_t vm_page_array; /* First resident page in table */ +extern +long first_page; /* first physical page number */ + /* ... represented in vm_page_array */ +extern +long last_page; /* last physical page number */ + /* ... 
represented in vm_page_array */ + /* [INCLUSIVE] */ +extern +vm_offset_t first_phys_addr; /* physical address for first_page */ +extern +vm_offset_t last_phys_addr; /* physical address for last_page */ + +#define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) + +#define IS_VM_PHYSADDR(pa) \ + ((pa) >= first_phys_addr && (pa) <= last_phys_addr) + +#define PHYS_TO_VM_PAGE(pa) \ + (&vm_page_array[atop(pa) - first_page ]) + +extern +simple_lock_data_t vm_page_queue_lock; /* lock on active and inactive + page queues */ +extern /* lock on free page queue */ +simple_lock_data_t vm_page_queue_free_lock; + +/* + * Functions implemented as macros + */ + +#define PAGE_ASSERT_WAIT(m, interruptible) { \ + (m)->flags |= PG_WANTED; \ + assert_wait((int) (m), (interruptible)); \ + } + +#define PAGE_WAKEUP(m) { \ + (m)->flags &= ~PG_BUSY; \ + if ((m)->flags & PG_WANTED) { \ + (m)->flags &= ~PG_WANTED; \ + wakeup((caddr_t) (m)); \ + } \ + } + +#define vm_page_lock_queues() simple_lock(&vm_page_queue_lock) +#define vm_page_unlock_queues() simple_unlock(&vm_page_queue_lock) + +#define vm_page_set_modified(m) { (m)->flags &= ~PG_CLEAN; } + +#define VM_PAGE_INIT(mem, object, offset) { \ + (mem)->flags = PG_BUSY | PG_CLEAN | PG_FAKE; \ + vm_page_insert((mem), (object), (offset)); \ + (mem)->wire_count = 0; \ + (mem)->hold_count = 0; \ + (mem)->act_count = 0; \ +} + +void vm_page_activate __P((vm_page_t)); +vm_page_t vm_page_alloc __P((vm_object_t, vm_offset_t)); +void vm_page_copy __P((vm_page_t, vm_page_t)); +void vm_page_deactivate __P((vm_page_t)); +void vm_page_free __P((vm_page_t)); +void vm_page_insert __P((vm_page_t, vm_object_t, vm_offset_t)); +vm_page_t vm_page_lookup __P((vm_object_t, vm_offset_t)); +void vm_page_remove __P((vm_page_t)); +void vm_page_rename __P((vm_page_t, vm_object_t, vm_offset_t)); +vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); +void vm_page_unwire __P((vm_page_t)); +void vm_page_wire __P((vm_page_t)); +boolean_t vm_page_zero_fill __P((vm_page_t)); + + +/* + * Keep page from being freed by the page daemon + * much of the same effect as wiring, except much lower + * overhead and should be used only for *very* temporary + * holding ("wiring"). + */ +static inline void +vm_page_hold(mem) + vm_page_t mem; +{ + mem->hold_count++; +} + +static inline void +vm_page_unhold(mem) + vm_page_t mem; +{ + if( --mem->hold_count < 0) + panic("vm_page_unhold: hold count < 0!!!"); +} + +#endif /* KERNEL */ +#endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c new file mode 100644 index 0000000..cabb102 --- /dev/null +++ b/sys/vm/vm_pageout.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + * + * $Id: vm_pageout.c,v 1.3 1994/06/06 11:56:27 davidg Exp $ + */ + +/* + * The proverbial page-out daemon. 
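+ *
+ * vm_pageout_clean() below gathers up to VM_PAGEOUT_PAGE_COUNT pages
+ * starting at a dirty inactive page and pushes them to the object's
+ * pager in a single request.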
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +extern vm_map_t kmem_map; +int vm_pages_needed; /* Event on which pageout daemon sleeps */ +int vm_pagescanner; /* Event on which pagescanner sleeps */ +int vm_pageout_free_min = 0; /* Stop pageout to wait for pagers at this free level */ + +int vm_pageout_pages_needed = 0; /* flag saying that the pageout daemon needs pages */ +int vm_page_pagesfreed; + +extern int npendingio; +extern int hz; +int vm_pageout_proc_limit; +extern int nswiodone; +extern int swap_pager_full; +extern int swap_pager_ready(); + +#define MAXREF 32767 + +#define MAXSCAN 512 /* maximum number of pages to scan in active queue */ + /* set the "clock" hands to be (MAXSCAN * 4096) Bytes */ +#define ACT_DECLINE 1 +#define ACT_ADVANCE 3 +#define ACT_MAX 300 + +#define LOWATER ((2048*1024)/NBPG) + +#define VM_PAGEOUT_PAGE_COUNT 8 +static vm_offset_t vm_space_needed; +int vm_pageout_req_do_stats; + +int vm_page_max_wired = 0; /* XXX max # of wired pages system-wide */ + + +/* + * vm_pageout_clean: + * cleans a vm_page + */ +int +vm_pageout_clean(m, sync) + register vm_page_t m; + int sync; +{ + /* + * Clean the page and remove it from the + * laundry. + * + * We set the busy bit to cause + * potential page faults on this page to + * block. + * + * And we set pageout-in-progress to keep + * the object from disappearing during + * pageout. This guarantees that the + * page won't move from the inactive + * queue. (However, any other page on + * the inactive queue may move!) + */ + + register vm_object_t object; + register vm_pager_t pager; + int pageout_status[VM_PAGEOUT_PAGE_COUNT]; + vm_page_t ms[VM_PAGEOUT_PAGE_COUNT]; + int pageout_count; + int anyok=0; + int i; + vm_offset_t offset = m->offset; + + object = m->object; + if (!object) { + printf("pager: object missing\n"); + return 0; + } + + /* + * Try to collapse the object before + * making a pager for it. We must + * unlock the page queues first. + * We try to defer the creation of a pager + * until all shadows are not paging. This + * allows vm_object_collapse to work better and + * helps control swap space size. + * (J. 
Dyson 11 Nov 93) + */ + + if (!object->pager && + cnt.v_free_count < vm_pageout_free_min) + return 0; + + if (!object->pager && + object->shadow && + object->shadow->paging_in_progress) + return 0; + + if( !sync) { + if (object->shadow) { + vm_object_collapse(object); + if (!vm_page_lookup(object, offset)) + return 0; + } + + if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { + return 0; + } + } + + pageout_count = 1; + ms[0] = m; + + if( pager = object->pager) { + for(i=1;i<VM_PAGEOUT_PAGE_COUNT;i++) { + if( ms[i] = vm_page_lookup( object, offset+i*NBPG)) { + if((((ms[i]->flags & (PG_CLEAN|PG_INACTIVE|PG_BUSY)) == PG_INACTIVE) + || (( ms[i]->flags & PG_CLEAN) == 0 && sync == VM_PAGEOUT_FORCE)) + && (ms[i]->wire_count == 0) + && (ms[i]->hold_count == 0)) + pageout_count++; + else + break; + } else + break; + } + for(i=0;i<pageout_count;i++) { + ms[i]->flags |= PG_BUSY; + pmap_page_protect(VM_PAGE_TO_PHYS(ms[i]), VM_PROT_READ); + } + object->paging_in_progress += pageout_count; + cnt.v_pageouts += pageout_count; + } else { + + m->flags |= PG_BUSY; + + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ); + + cnt.v_pageouts++; + + object->paging_in_progress++; + + pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, + object->size, VM_PROT_ALL, 0); + if (pager != NULL) { + vm_object_setpager(object, pager, 0, FALSE); + } + } + + /* + * If there is no pager for the page, + * use the default pager. If there's + * no place to put the page at the + * moment, leave it in the laundry and + * hope that there will be paging space + * later. + */ + + if ((pager && pager->pg_type == PG_SWAP) || + cnt.v_free_count >= vm_pageout_free_min) { + if( pageout_count == 1) { + pageout_status[0] = pager ? + vm_pager_put(pager, m, + ((sync || (object == kernel_object)) ? TRUE: FALSE)) : + VM_PAGER_FAIL; + } else { + if( !pager) { + for(i=0;i<pageout_count;i++) + pageout_status[i] = VM_PAGER_FAIL; + } else { + vm_pager_put_pages(pager, ms, pageout_count, + ((sync || (object == kernel_object)) ? TRUE : FALSE), + pageout_status); + } + } + + } else { + for(i=0;i<pageout_count;i++) + pageout_status[i] = VM_PAGER_FAIL; + } + + for(i=0;i<pageout_count;i++) { + switch (pageout_status[i]) { + case VM_PAGER_OK: + ms[i]->flags &= ~PG_LAUNDRY; + ++anyok; + break; + case VM_PAGER_PEND: + ms[i]->flags &= ~PG_LAUNDRY; + ++anyok; + break; + case VM_PAGER_BAD: + /* + * Page outside of range of object. + * Right now we essentially lose the + * changes by pretending it worked. + */ + ms[i]->flags &= ~PG_LAUNDRY; + ms[i]->flags |= PG_CLEAN; + pmap_clear_modify(VM_PAGE_TO_PHYS(ms[i])); + break; + case VM_PAGER_ERROR: + case VM_PAGER_FAIL: + /* + * If page couldn't be paged out, then + * reactivate the page so it doesn't + * clog the inactive list. (We will + * try paging out it again later). + */ + if (ms[i]->flags & PG_INACTIVE) + vm_page_activate(ms[i]); + break; + case VM_PAGER_AGAIN: + break; + } + + + /* + * If the operation is still going, leave + * the page busy to block all other accesses. + * Also, leave the paging in progress + * indicator set so that we don't attempt an + * object collapse. 
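+ * For VM_PAGER_PEND the pager's I/O completion code is expected
+ * to clear the busy bit and paging_in_progress later, so only the
+ * non-PEND cases are unwound here.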
+ */ + if (pageout_status[i] != VM_PAGER_PEND) { + PAGE_WAKEUP(ms[i]); + if (--object->paging_in_progress == 0) + wakeup((caddr_t) object); + if (pmap_is_referenced(VM_PAGE_TO_PHYS(ms[i]))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(ms[i])); + if( ms[i]->flags & PG_INACTIVE) + vm_page_activate(ms[i]); + } + } + } + return anyok; +} + +/* + * vm_pageout_object_deactivate_pages + * + * deactivate enough pages to satisfy the inactive target + * requirements or if vm_page_proc_limit is set, then + * deactivate all of the pages in the object and its + * shadows. + * + * The object and map must be locked. + */ +int +vm_pageout_object_deactivate_pages(map, object, count) + vm_map_t map; + vm_object_t object; + int count; +{ + register vm_page_t p, next; + int rcount; + int s; + int dcount; + + dcount = 0; + if (count == 0) + count = 1; + + if (object->shadow) { + int scount = count; + if( object->shadow->ref_count > 1) + scount /= object->shadow->ref_count; + if( scount) + dcount += vm_pageout_object_deactivate_pages(map, object->shadow, scount); + } + + if (object->paging_in_progress) + return dcount; + + /* + * scan the objects entire memory queue + */ + rcount = object->resident_page_count; + p = object->memq.tqh_first; + while (p && (rcount-- > 0)) { + next = p->listq.tqe_next; + vm_page_lock_queues(); + /* + * if a page is active, not wired and is in the processes pmap, + * then deactivate the page. + */ + if ((p->flags & (PG_ACTIVE|PG_BUSY)) == PG_ACTIVE && + p->wire_count == 0 && + p->hold_count == 0 && + pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { + if (!pmap_is_referenced(VM_PAGE_TO_PHYS(p))) { + p->act_count -= min(p->act_count, ACT_DECLINE); + /* + * if the page act_count is zero -- then we deactivate + */ + if (!p->act_count) { + vm_page_deactivate(p); + pmap_page_protect(VM_PAGE_TO_PHYS(p), + VM_PROT_NONE); + /* + * else if on the next go-around we will deactivate the page + * we need to place the page on the end of the queue to age + * the other pages in memory. + */ + } else { + TAILQ_REMOVE(&vm_page_queue_active, p, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); + TAILQ_REMOVE(&object->memq, p, listq); + TAILQ_INSERT_TAIL(&object->memq, p, listq); + } + /* + * see if we are done yet + */ + if (p->flags & PG_INACTIVE) { + --count; + ++dcount; + if (count <= 0 && + cnt.v_inactive_count > cnt.v_inactive_target) { + vm_page_unlock_queues(); + return dcount; + } + } + + } else { + /* + * Move the page to the bottom of the queue. + */ + pmap_clear_reference(VM_PAGE_TO_PHYS(p)); + if (p->act_count < ACT_MAX) + p->act_count += ACT_ADVANCE; + + TAILQ_REMOVE(&vm_page_queue_active, p, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); + TAILQ_REMOVE(&object->memq, p, listq); + TAILQ_INSERT_TAIL(&object->memq, p, listq); + } + } + + vm_page_unlock_queues(); + p = next; + } + return dcount; +} + + +/* + * deactivate some number of pages in a map, try to do it fairly, but + * that is really hard to do. 
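+ *
+ * The routine walks the map's entries (recursing into share maps
+ * and sub-maps) and applies the supplied "freeer" routine to each
+ * backing object, decrementing *count as pages are deactivated.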
+ */ + +void +vm_pageout_map_deactivate_pages(map, entry, count, freeer) + vm_map_t map; + vm_map_entry_t entry; + int *count; + int (*freeer)(vm_map_t, vm_object_t, int); +{ + vm_map_t tmpm; + vm_map_entry_t tmpe; + vm_object_t obj; + if (*count <= 0) + return; + vm_map_reference(map); + if (!lock_try_read(&map->lock)) { + vm_map_deallocate(map); + return; + } + if (entry == 0) { + tmpe = map->header.next; + while (tmpe != &map->header && *count > 0) { + vm_pageout_map_deactivate_pages(map, tmpe, count, freeer); + tmpe = tmpe->next; + }; + } else if (entry->is_sub_map || entry->is_a_map) { + tmpm = entry->object.share_map; + tmpe = tmpm->header.next; + while (tmpe != &tmpm->header && *count > 0) { + vm_pageout_map_deactivate_pages(tmpm, tmpe, count, freeer); + tmpe = tmpe->next; + }; + } else if (obj = entry->object.vm_object) { + *count -= (*freeer)(map, obj, *count); + } + lock_read_done(&map->lock); + vm_map_deallocate(map); + return; +} + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. + */ +int +vm_pageout_scan() +{ + vm_page_t m; + int page_shortage, maxscan, maxlaunder; + int pages_freed, free, nproc; + int desired_free; + vm_page_t next; + struct proc *p; + vm_object_t object; + int s; + int force_wakeup = 0; + +morefree: + /* + * scan the processes for exceeding their rlimits or if process + * is swapped out -- deactivate pages + */ + +rescanproc1: + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + vm_offset_t size; + int overage; + vm_offset_t limit; + + /* + * if this is a system process or if we have already + * looked at this process, skip it. + */ + if (p->p_flag & (P_SYSTEM|P_WEXIT)) { + continue; + } + + /* + * if the process is in a non-running type state, + * don't touch it. + */ + if (p->p_stat != SRUN && p->p_stat != SSLEEP) { + continue; + } + + /* + * get a limit + */ + limit = min(p->p_rlimit[RLIMIT_RSS].rlim_cur, + p->p_rlimit[RLIMIT_RSS].rlim_max); + + /* + * let processes that are swapped out really be swapped out + * set the limit to nothing (will force a swap-out.) + */ + if ((p->p_flag & P_INMEM) == 0) + limit = 0; + + size = p->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG; + if (size >= limit) { + overage = (size - limit) / NBPG; + vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, + (vm_map_entry_t) 0, &overage, vm_pageout_object_deactivate_pages); + } + + } + + if (((cnt.v_free_count + cnt.v_inactive_count) >= + (cnt.v_inactive_target + cnt.v_free_target)) && + (cnt.v_free_count >= cnt.v_free_target)) + return force_wakeup; + + pages_freed = 0; + desired_free = cnt.v_free_target; + + /* + * Start scanning the inactive queue for pages we can free. + * We keep scanning until we have enough free pages or + * we have scanned through the entire queue. If we + * encounter dirty pages, we start cleaning them. 
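+ *
+ * Clean pages that are still being referenced are reactivated;
+ * clean pages whose act_count has decayed to zero are freed; dirty
+ * pages on the laundry are pushed to their pager with
+ * vm_pageout_clean(), subject to the maxlaunder limit.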
+ */ + + maxlaunder = (cnt.v_free_target - cnt.v_free_count); + maxscan = cnt.v_inactive_count; +rescan1: + m = vm_page_queue_inactive.tqh_first; + while (m && (maxscan-- > 0) && + (cnt.v_free_count < desired_free) ) { + vm_page_t next; + + next = m->pageq.tqe_next; + + if( (m->flags & PG_INACTIVE) == 0) { + printf("vm_pageout_scan: page not inactive?"); + continue; + } + + /* + * activate held pages + */ + if (m->hold_count != 0) { + vm_page_activate(m); + m = next; + continue; + } + + /* + * dont mess with busy pages + */ + if (m->flags & PG_BUSY) { + m = next; + continue; + } + + /* + * if page is clean and but the page has been referenced, + * then reactivate the page, but if we are very low on memory + * or the page has not been referenced, then we free it to the + * vm system. + */ + if (m->flags & PG_CLEAN) { + if ((cnt.v_free_count > vm_pageout_free_min) /* XXX */ + && pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + vm_page_activate(m); + } else if (!m->act_count) { + pmap_page_protect(VM_PAGE_TO_PHYS(m), + VM_PROT_NONE); + vm_page_free(m); + ++pages_freed; + } else { + m->act_count -= min(m->act_count, ACT_DECLINE); + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + } + } else if ((m->flags & PG_LAUNDRY) && maxlaunder > 0) { + int written; + if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + vm_page_activate(m); + m = next; + continue; + } + /* + * If a page is dirty, then it is either + * being washed (but not yet cleaned) + * or it is still in the laundry. If it is + * still in the laundry, then we start the + * cleaning operation. + */ + + if (written = vm_pageout_clean(m,0)) { + maxlaunder -= written; + } + /* + * if the next page has been re-activated, start scanning again + */ + if (next && (next->flags & PG_INACTIVE) == 0) + goto rescan1; + } else if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + vm_page_activate(m); + } + m = next; + } + + /* + * now check malloc area or swap processes out if we are in low + * memory conditions + */ + if (cnt.v_free_count <= cnt.v_free_min) { + /* + * swap out inactive processes + */ + swapout_threads(); + } + + /* + * Compute the page shortage. If we are still very low on memory + * be sure that we will move a minimal amount of pages from active + * to inactive. + */ + + page_shortage = cnt.v_inactive_target - + (cnt.v_free_count + cnt.v_inactive_count); + + if (page_shortage <= 0) { + if (pages_freed == 0) { + if( cnt.v_free_count < cnt.v_free_min) { + page_shortage = cnt.v_free_min - cnt.v_free_count; + } else if(((cnt.v_free_count + cnt.v_inactive_count) < + (cnt.v_free_min + cnt.v_inactive_target))) { + page_shortage = 1; + } else { + page_shortage = 0; + } + } + + } + + maxscan = cnt.v_active_count; + m = vm_page_queue_active.tqh_first; + while (m && maxscan-- && (page_shortage > 0)) { + + next = m->pageq.tqe_next; + + /* + * Don't deactivate pages that are busy. 
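+ * Held pages are skipped as well.  Referenced pages get their
+ * act_count advanced and are requeued at the tail; otherwise
+ * act_count decays and the page is deactivated once it hits zero.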
+ */ + if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { + m = next; + continue; + } + + if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + if (m->act_count < ACT_MAX) + m->act_count += ACT_ADVANCE; + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + } else { + m->act_count -= min(m->act_count, ACT_DECLINE); + + /* + * if the page act_count is zero -- then we deactivate + */ + if (!m->act_count) { + vm_page_deactivate(m); + --page_shortage; + /* + * else if on the next go-around we will deactivate the page + * we need to place the page on the end of the queue to age + * the other pages in memory. + */ + } else { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + TAILQ_REMOVE(&m->object->memq, m, listq); + TAILQ_INSERT_TAIL(&m->object->memq, m, listq); + } + } + + m = next; + } + + /* + * if we have not freed any pages and we are desparate for memory + * then we keep trying until we get some (any) memory. + */ + + if( !force_wakeup && (swap_pager_full || !force_wakeup || + (pages_freed == 0 && (cnt.v_free_count < cnt.v_free_min)))){ + vm_pager_sync(); + force_wakeup = 1; + goto morefree; + } + vm_page_pagesfreed += pages_freed; + return force_wakeup; +} + +/* + * vm_pageout is the high level pageout daemon. + */ +void +vm_pageout() +{ + extern npendingio, swiopend; + static nowakeup; + (void) spl0(); + + /* + * Initialize some paging parameters. + */ + +vmretry: + cnt.v_free_min = 12; + cnt.v_free_reserved = 8; + if (cnt.v_free_min < 8) + cnt.v_free_min = 8; + if (cnt.v_free_min > 32) + cnt.v_free_min = 32; + vm_pageout_free_min = 4; + cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; + cnt.v_inactive_target = cnt.v_free_count / 12; + cnt.v_free_min += cnt.v_free_reserved; + + /* XXX does not really belong here */ + if (vm_page_max_wired == 0) + vm_page_max_wired = cnt.v_free_count / 3; + + + (void) swap_pager_alloc(0, 0, 0, 0); + + /* + * The pageout daemon is never done, so loop + * forever. + */ + while (TRUE) { + int force_wakeup; + extern struct loadavg averunnable; +/* + cnt.v_free_min = 12 + averunnable.ldavg[0] / 1024; + cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; + cnt.v_inactive_target = cnt.v_free_target*2; +*/ + + tsleep((caddr_t) &vm_pages_needed, PVM, "psleep", 0); + + vm_pager_sync(); + /* + * The force wakeup hack added to eliminate delays and potiential + * deadlock. It was possible for the page daemon to indefintely + * postpone waking up a process that it might be waiting for memory + * on. The putmulti stuff seems to have aggravated the situation. + */ + force_wakeup = vm_pageout_scan(); + vm_pager_sync(); + if( force_wakeup) + wakeup( (caddr_t) &cnt.v_free_count); + cnt.v_scan++; + wakeup((caddr_t) kmem_map); + } +} + diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h new file mode 100644 index 0000000..834aee5 --- /dev/null +++ b/sys/vm/vm_pageout.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Avadis Tevanian, Jr. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Header file for pageout daemon. + */ + +/* + * Exported data structures. + */ + +extern int vm_pages_needed; /* should be some "event" structure */ +simple_lock_data_t vm_pages_needed_lock; +extern int vm_pageout_pages_needed; + +#define VM_PAGEOUT_ASYNC 0 +#define VM_PAGEOUT_SYNC 1 +#define VM_PAGEOUT_FORCE 2 + +/* + * Exported routines. + */ + +/* + * Signal pageout-daemon and wait for it. 
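+ *
+ * If the caller is the pageout daemon itself, sleep on
+ * vm_pageout_pages_needed rather than on the free count (to avoid
+ * waiting on ourselves); otherwise wake the daemon and sleep until
+ * free pages become available.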
+ */ + +#define VM_WAIT vm_wait() + +inline static void vm_wait() { + extern struct proc *curproc, *pageproc; + int s; + s = splhigh(); + if (curproc == pageproc) { + vm_pageout_pages_needed = 1; + tsleep((caddr_t) &vm_pageout_pages_needed, PSWP, "vmwait", 0); + vm_pageout_pages_needed = 0; + } else { + wakeup((caddr_t) &vm_pages_needed); + tsleep((caddr_t) &cnt.v_free_count, PVM, "vmwait", 0); + } + splx(s); +} + + +#ifdef KERNEL +void vm_pageout __P((void)); +int vm_pageout_scan __P((void)); +void vm_pageout_page __P((vm_page_t, vm_object_t)); +void vm_pageout_cluster __P((vm_page_t, vm_object_t)); +#endif diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c new file mode 100644 index 0000000..1e4b201 --- /dev/null +++ b/sys/vm/vm_pager.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Paging space routine stubs. Emulates a matchmaker-like interface + * for builtin pagers. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +extern struct pagerops swappagerops; +extern struct pagerops vnodepagerops; +extern struct pagerops devicepagerops; + +struct pagerops *pagertab[] = { + &swappagerops, /* PG_SWAP */ + &vnodepagerops, /* PG_VNODE */ + &devicepagerops, /* PG_DEV */ +}; +int npagers = sizeof (pagertab) / sizeof (pagertab[0]); + +struct pagerops *dfltpagerops = NULL; /* default pager */ + +/* + * Kernel address space for mapping pages. + * Used by pagers where KVAs are needed for IO. + * + * XXX needs to be large enough to support the number of pending async + * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size + * (MAXPHYS == 64k) if you want to get the most efficiency. + */ +#define PAGER_MAP_SIZE (4 * 1024 * 1024) + +int pager_map_size = PAGER_MAP_SIZE; +vm_map_t pager_map; +boolean_t pager_map_wanted; +vm_offset_t pager_sva, pager_eva; + +void +vm_pager_init() +{ + struct pagerops **pgops; + + /* + * Allocate a kernel submap for tracking get/put page mappings + */ +/* + pager_map = kmem_suballoc(kernel_map, &pager_sva, &pager_eva, + PAGER_MAP_SIZE, FALSE); +*/ + /* + * Initialize known pagers + */ + for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) + if (pgops) + (*(*pgops)->pgo_init)(); + if (dfltpagerops == NULL) + panic("no default pager"); +} + +/* + * Allocate an instance of a pager of the given type. + * Size, protection and offset parameters are passed in for pagers that + * need to perform page-level validation (e.g. the device pager). + */ +vm_pager_t +vm_pager_allocate(type, handle, size, prot, off) + int type; + caddr_t handle; + vm_size_t size; + vm_prot_t prot; + vm_offset_t off; +{ + struct pagerops *ops; + + ops = (type == PG_DFLT) ? 
dfltpagerops : pagertab[type]; + if (ops) + return ((*ops->pgo_alloc)(handle, size, prot, off)); + return (NULL); +} + +void +vm_pager_deallocate(pager) + vm_pager_t pager; +{ + if (pager == NULL) + panic("vm_pager_deallocate: null pager"); + + (*pager->pg_ops->pgo_dealloc)(pager); +} + + +int +vm_pager_get_pages(pager, m, count, reqpage, sync) + vm_pager_t pager; + vm_page_t *m; + int count; + int reqpage; + boolean_t sync; +{ + extern boolean_t vm_page_zero_fill(); + extern int vm_pageout_count; + int i; + + if (pager == NULL) { + for (i=0;i<count;i++) { + if( i != reqpage) { + PAGE_WAKEUP(m[i]); + vm_page_free(m[i]); + } + } + vm_page_zero_fill(m[reqpage]); + return VM_PAGER_OK; + } + + if( pager->pg_ops->pgo_getpages == 0) { + for(i=0;i<count;i++) { + if( i != reqpage) { + PAGE_WAKEUP(m[i]); + vm_page_free(m[i]); + } + } + return(VM_PAGER_GET(pager, m[reqpage], sync)); + } else { + return(VM_PAGER_GET_MULTI(pager, m, count, reqpage, sync)); + } +} + +int +vm_pager_put_pages(pager, m, count, sync, rtvals) + vm_pager_t pager; + vm_page_t *m; + int count; + boolean_t sync; + int *rtvals; +{ + int i; + + if( pager->pg_ops->pgo_putpages) + return(VM_PAGER_PUT_MULTI(pager, m, count, sync, rtvals)); + else { + for(i=0;i<count;i++) { + rtvals[i] = VM_PAGER_PUT( pager, m[i], sync); + } + return rtvals[0]; + } +} + +boolean_t +vm_pager_has_page(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ + if (pager == NULL) + panic("vm_pager_has_page: null pager"); + return ((*pager->pg_ops->pgo_haspage)(pager, offset)); +} + +/* + * Called by pageout daemon before going back to sleep. + * Gives pagers a chance to clean up any completed async pageing operations. + */ +void +vm_pager_sync() +{ + struct pagerops **pgops; + + for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) + if (pgops) + (*(*pgops)->pgo_putpage)(NULL, NULL, 0); +} + +#if 0 +void +vm_pager_cluster(pager, offset, loff, hoff) + vm_pager_t pager; + vm_offset_t offset; + vm_offset_t *loff; + vm_offset_t *hoff; +{ + if (pager == NULL) + panic("vm_pager_cluster: null pager"); + return ((*pager->pg_ops->pgo_cluster)(pager, offset, loff, hoff)); +} +#endif + +vm_offset_t +vm_pager_map_page(m) + vm_page_t m; +{ + vm_offset_t kva; + + kva = kmem_alloc_wait(pager_map, PAGE_SIZE); + pmap_enter(vm_map_pmap(pager_map), kva, VM_PAGE_TO_PHYS(m), + VM_PROT_DEFAULT, TRUE); + return(kva); +} + +void +vm_pager_unmap_page(kva) + vm_offset_t kva; +{ + kmem_free_wakeup(pager_map, kva, PAGE_SIZE); +} + +vm_page_t +vm_pager_atop(kva) + vm_offset_t kva; +{ + vm_offset_t pa; + + pa = pmap_extract(vm_map_pmap(pager_map), kva); + if (pa == 0) + panic("vm_pager_atop"); + return (PHYS_TO_VM_PAGE(pa)); +} + +vm_pager_t +vm_pager_lookup(pglist, handle) + register struct pagerlst *pglist; + caddr_t handle; +{ + register vm_pager_t pager; + + for (pager = pglist->tqh_first; pager; pager = pager->pg_list.tqe_next) + if (pager->pg_handle == handle) + return (pager); + return (NULL); +} + +/* + * This routine gains a reference to the object. + * Explicit deallocation is necessary. 
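+ *
+ * pager_cache() sets or clears OBJ_CANPERSIST on the object and
+ * then drops a reference with vm_object_deallocate().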
+ */ +int +pager_cache(object, should_cache) + vm_object_t object; + boolean_t should_cache; +{ + if (object == NULL) + return (KERN_INVALID_ARGUMENT); + + vm_object_cache_lock(); + vm_object_lock(object); + if (should_cache) + object->flags |= OBJ_CANPERSIST; + else + object->flags &= ~OBJ_CANPERSIST; + vm_object_unlock(object); + vm_object_cache_unlock(); + + vm_object_deallocate(object); + + return (KERN_SUCCESS); +} diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h new file mode 100644 index 0000000..3e20e50 --- /dev/null +++ b/sys/vm/vm_pager.h @@ -0,0 +1,154 @@ + +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 + */ + +/* + * Pager routine interface definition. + * For BSD we use a cleaner version of the internal pager interface. + */ + +#ifndef _VM_PAGER_ +#define _VM_PAGER_ + +TAILQ_HEAD(pagerlst, pager_struct); + +struct pager_struct { + TAILQ_ENTRY(pager_struct) pg_list; /* links for list management */ + caddr_t pg_handle; /* ext. handle (vp, dev, fp) */ + int pg_type; /* type of pager */ + int pg_flags; /* flags */ + struct pagerops *pg_ops; /* pager operations */ + void *pg_data; /* private pager data */ +}; + +/* pager types */ +#define PG_DFLT -1 +#define PG_SWAP 0 +#define PG_VNODE 1 +#define PG_DEVICE 2 + +/* flags */ +#define PG_CLUSTERGET 1 +#define PG_CLUSTERPUT 2 + +struct pagerops { + void (*pgo_init) /* Initialize pager. */ + __P((void)); + vm_pager_t (*pgo_alloc) /* Allocate pager. 
*/ + __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); + void (*pgo_dealloc) /* Disassociate. */ + __P((vm_pager_t)); + int (*pgo_getpage) + __P((vm_pager_t, vm_page_t, boolean_t)); + int (*pgo_getpages) /* Get (read) page. */ + __P((vm_pager_t, vm_page_t *, int, int, boolean_t)); + int (*pgo_putpage) + __P((vm_pager_t, vm_page_t, boolean_t)); + int (*pgo_putpages) /* Put (write) page. */ + __P((vm_pager_t, vm_page_t *, int, boolean_t, int *)); + boolean_t (*pgo_haspage) /* Does pager have page? */ + __P((vm_pager_t, vm_offset_t)); +}; + +#define VM_PAGER_ALLOC(h, s, p, o) (*(pg)->pg_ops->pgo_alloc)(h, s, p, o) +#define VM_PAGER_DEALLOC(pg) (*(pg)->pg_ops->pgo_dealloc)(pg) +#define VM_PAGER_GET(pg, m, s) (*(pg)->pg_ops->pgo_getpage)(pg, m, s) +#define VM_PAGER_GET_MULTI(pg, m, c, r, s) (*(pg)->pg_ops->pgo_getpages)(pg, m, c, r, s) +#define VM_PAGER_PUT(pg, m, s) (*(pg)->pg_ops->pgo_putpage)(pg, m, s) +#define VM_PAGER_PUT_MULTI(pg, m, c, s, rtval) (*(pg)->pg_ops->pgo_putpages)(pg, m, c, s, rtval) +#define VM_PAGER_HASPAGE(pg, o) (*(pg)->pg_ops->pgo_haspage)(pg, o) + +/* + * get/put return values + * OK operation was successful + * BAD specified data was out of the accepted range + * FAIL specified data was in range, but doesn't exist + * PEND operations was initiated but not completed + * ERROR error while accessing data that is in range and exists + * AGAIN temporary resource shortage prevented operation from happening + */ +#define VM_PAGER_OK 0 +#define VM_PAGER_BAD 1 +#define VM_PAGER_FAIL 2 +#define VM_PAGER_PEND 3 +#define VM_PAGER_ERROR 4 +#define VM_PAGER_AGAIN 5 + +#ifdef KERNEL +extern struct pagerops *dfltpagerops; + +vm_pager_t vm_pager_allocate + __P((int, caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); +vm_page_t vm_pager_atop __P((vm_offset_t)); +void vm_pager_deallocate __P((vm_pager_t)); +int vm_pager_get_pages + __P((vm_pager_t, vm_page_t *, int, int, boolean_t)); +boolean_t vm_pager_has_page __P((vm_pager_t, vm_offset_t)); +void vm_pager_init __P((void)); +vm_pager_t vm_pager_lookup __P((struct pagerlst *, caddr_t)); +vm_offset_t vm_pager_map_pages __P((vm_page_t *, int, boolean_t)); +int vm_pager_put_pages + __P((vm_pager_t, vm_page_t *, int, boolean_t, int *)); +void vm_pager_sync __P((void)); +void vm_pager_unmap_pages __P((vm_offset_t, int)); + +#define vm_pager_cancluster(p, b) ((p)->pg_flags & (b)) + +/* + * XXX compat with old interface + */ +#define vm_pager_get(p, m, s) \ +({ \ + vm_page_t ml[1]; \ + ml[0] = (m); \ + vm_pager_get_pages(p, ml, 1, 0, s); \ +}) + +#define vm_pager_put(p, m, s) \ +({ \ + int rtval; \ + vm_page_t ml[1]; \ + ml[0] = (m); \ + vm_pager_put_pages(p, ml, 1, s, &rtval); \ + rtval; \ +}) +#endif + +#endif /* _VM_PAGER_ */ diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h new file mode 100644 index 0000000..4a785ce --- /dev/null +++ b/sys/vm/vm_param.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_param.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Machine independent virtual memory parameters. + */ + +#ifndef _VM_PARAM_ +#define _VM_PARAM_ + +#include <machine/vmparam.h> + +/* + * This belongs in types.h, but breaks too many existing programs. + */ +typedef int boolean_t; +#define TRUE 1 +#define FALSE 0 + +/* + * The machine independent pages are refered to as PAGES. A page + * is some number of hardware pages, depending on the target machine. + */ +#define DEFAULT_PAGE_SIZE 4096 + +#if 0 + +/* + * All references to the size of a page should be done with PAGE_SIZE + * or PAGE_SHIFT. The fact they are variables is hidden here so that + * we can easily make them constant if we so desire. 
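+ *
+ * (The definitions below are compiled out by the surrounding
+ * #if 0 in this version.)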
+ */ +#ifndef PAGE_SIZE +#define PAGE_SIZE cnt.v_page_size /* size of page */ +#endif +#ifndef PAGE_MASK +#define PAGE_MASK page_mask /* size of page - 1 */ +#endif +#ifndef PAGE_SHIFT +#define PAGE_SHIFT page_shift /* bits to shift for pages */ +#endif + +#endif + +#ifdef KERNEL +extern vm_size_t page_mask; +extern int page_shift; +#endif + +/* + * CTL_VM identifiers + */ +#define VM_METER 1 /* struct vmmeter */ +#define VM_LOADAVG 2 /* struct loadavg */ +#define VM_MAXID 3 /* number of valid vm ids */ + +#define CTL_VM_NAMES { \ + { 0, 0 }, \ + { "vmmeter", CTLTYPE_STRUCT }, \ + { "loadavg", CTLTYPE_STRUCT }, \ +} + +/* + * Return values from the VM routines. + */ +#define KERN_SUCCESS 0 +#define KERN_INVALID_ADDRESS 1 +#define KERN_PROTECTION_FAILURE 2 +#define KERN_NO_SPACE 3 +#define KERN_INVALID_ARGUMENT 4 +#define KERN_FAILURE 5 +#define KERN_RESOURCE_SHORTAGE 6 +#define KERN_NOT_RECEIVER 7 +#define KERN_NO_ACCESS 8 + +#ifndef ASSEMBLER +/* + * Convert addresses to pages and vice versa. + * No rounding is used. + */ +#ifdef KERNEL + +#if 0 + +#ifndef atop +#define atop(x) (((unsigned)(x)) >> PAGE_SHIFT) +#endif +#ifndef ptoa +#define ptoa(x) ((vm_offset_t)((x) << PAGE_SHIFT)) +#endif + +/* + * Round off or truncate to the nearest page. These will work + * for either addresses or counts (i.e., 1 byte rounds to 1 page). + */ +#ifndef round_page +#define round_page(x) \ + ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) & ~PAGE_MASK)) +#endif +#ifndef trunc_page +#define trunc_page(x) \ + ((vm_offset_t)(((vm_offset_t)(x)) & ~PAGE_MASK)) +#endif +#ifndef num_pages +#define num_pages(x) \ + ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) >> PAGE_SHIFT)) +#endif + +#endif +#define num_pages(x) \ + ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) >> PAGE_SHIFT)) + +extern vm_size_t mem_size; /* size of physical memory (bytes) */ +extern vm_offset_t first_addr; /* first physical page */ +extern vm_offset_t last_addr; /* last physical page */ + +#else +#if 0 +/* out-of-kernel versions of round_page and trunc_page */ +#define round_page(x) \ + ((((vm_offset_t)(x) + (vm_page_size - 1)) / vm_page_size) * vm_page_size) +#define trunc_page(x) \ + ((((vm_offset_t)(x)) / vm_page_size) * vm_page_size) +#endif + +#endif /* KERNEL */ +#endif /* ASSEMBLER */ +#endif /* _VM_PARAM_ */ diff --git a/sys/vm/vm_prot.h b/sys/vm/vm_prot.h new file mode 100644 index 0000000..ee009bc --- /dev/null +++ b/sys/vm/vm_prot.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_prot.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory protection definitions. + */ + +#ifndef _VM_PROT_ +#define _VM_PROT_ + +/* + * Types defined: + * + * vm_prot_t VM protection values. + */ + +typedef u_char vm_prot_t; + +/* + * Protection values, defined as bits within the vm_prot_t type + */ + +#define VM_PROT_NONE ((vm_prot_t) 0x00) + +#define VM_PROT_READ ((vm_prot_t) 0x01) /* read permission */ +#define VM_PROT_WRITE ((vm_prot_t) 0x02) /* write permission */ +#define VM_PROT_EXECUTE ((vm_prot_t) 0x04) /* execute permission */ + +/* + * The default protection for newly-created virtual memory + */ + +#define VM_PROT_DEFAULT (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) + +/* + * The maximum privileges possible, for parameter checking. + */ + +#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) + +#endif /* _VM_PROT_ */ diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c new file mode 100644 index 0000000..5008a09 --- /dev/null +++ b/sys/vm/vm_swap.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/dmap.h> /* XXX */ +#include <sys/vnode.h> +#include <sys/map.h> +#include <sys/file.h> + +#include <miscfs/specfs/specdev.h> + +/* + * Indirect driver for multi-controller paging. + */ + +int nswap, nswdev; +int vm_swap_size; +#ifdef SEQSWAP +int niswdev; /* number of interleaved swap devices */ +int niswap; /* size of interleaved swap area */ +#endif + +/* + * Set up swap devices. + * Initialize linked list of free swap + * headers. These do not actually point + * to buffers, but rather to pages that + * are being swapped in and out. + */ +void +swapinit() +{ + register int i; + register struct buf *sp = swbuf; + register struct proc *p = &proc0; /* XXX */ + struct swdevt *swp; + int error; + + /* + * Count swap devices, and adjust total swap space available. + * Some of the space will not be countable until later (dynamically + * configurable devices) and some of the counted space will not be + * available until a swapon() system call is issued, both usually + * happen when the system goes multi-user. + * + * If using NFS for swap, swdevt[0] will already be bdevvp'd. 
XXX + */ +#ifdef SEQSWAP + nswdev = niswdev = 0; + nswap = niswap = 0; + /* + * All interleaved devices must come first + */ + for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) { + if (swp->sw_flags & SW_SEQUENTIAL) + break; + niswdev++; + if (swp->sw_nblks > niswap) + niswap = swp->sw_nblks; + } + niswap = roundup(niswap, dmmax); + niswap *= niswdev; + if (swdevt[0].sw_vp == NULL && + bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp)) + panic("swapvp"); + /* + * The remainder must be sequential + */ + for ( ; swp->sw_dev != NODEV; swp++) { + if ((swp->sw_flags & SW_SEQUENTIAL) == 0) + panic("binit: mis-ordered swap devices"); + nswdev++; + if (swp->sw_nblks > 0) { + if (swp->sw_nblks % dmmax) + swp->sw_nblks -= (swp->sw_nblks % dmmax); + nswap += swp->sw_nblks; + } + } + nswdev += niswdev; + if (nswdev == 0) + panic("swapinit"); + nswap += niswap; +#else + nswdev = 0; + nswap = 0; + for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) { + nswdev++; + if (swp->sw_nblks > nswap) + nswap = swp->sw_nblks; + } + if (nswdev == 0) + panic("swapinit"); + if (nswdev > 1) + nswap = ((nswap + dmmax - 1) / dmmax) * dmmax; + nswap *= nswdev; + if (swdevt[0].sw_vp == NULL && + bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp)) + panic("swapvp"); +#endif + if (nswap == 0) + printf("WARNING: no swap space found\n"); + else if (error = swfree(p, 0)) { + printf("swfree errno %d\n", error); /* XXX */ + panic("swapinit swfree 0"); + } + + /* + * Now set up swap buffer headers. + */ + for (i = 0; i < nswbuf - 1; i++, sp++) { + TAILQ_INSERT_HEAD(&bswlist, sp, b_freelist); + sp->b_rcred = sp->b_wcred = p->p_ucred; + sp->b_vnbufs.le_next = NOLIST; + } + sp->b_rcred = sp->b_wcred = p->p_ucred; + sp->b_vnbufs.le_next = NOLIST; + sp->b_actf = NULL; +} + +void +swstrategy(bp) + register struct buf *bp; +{ + int sz, off, seg, index; + register struct swdevt *sp; + struct vnode *vp; + +#ifdef GENERIC + /* + * A mini-root gets copied into the front of the swap + * and we run over top of the swap area just long + * enough for us to do a mkfs and restor of the real + * root (sure beats rewriting standalone restor). + */ +#define MINIROOTSIZE 4096 + if (rootdev == dumpdev) + bp->b_blkno += MINIROOTSIZE; +#endif + sz = howmany(bp->b_bcount, DEV_BSIZE); + if (bp->b_blkno + sz > nswap) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + if (nswdev > 1) { +#ifdef SEQSWAP + if (bp->b_blkno < niswap) { + if (niswdev > 1) { + off = bp->b_blkno % dmmax; + if (off+sz > dmmax) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + seg = bp->b_blkno / dmmax; + index = seg % niswdev; + seg /= niswdev; + bp->b_blkno = seg*dmmax + off; + } else + index = 0; + } else { + register struct swdevt *swp; + + bp->b_blkno -= niswap; + for (index = niswdev, swp = &swdevt[niswdev]; + swp->sw_dev != NODEV; + swp++, index++) { + if (bp->b_blkno < swp->sw_nblks) + break; + bp->b_blkno -= swp->sw_nblks; + } + if (swp->sw_dev == NODEV || + bp->b_blkno+sz > swp->sw_nblks) { + bp->b_error = swp->sw_dev == NODEV ? 
+ ENODEV : EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + } +#else + off = bp->b_blkno % dmmax; + if (off+sz > dmmax) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + seg = bp->b_blkno / dmmax; + index = seg % nswdev; + seg /= nswdev; + bp->b_blkno = seg*dmmax + off; +#endif + } else + index = 0; + sp = &swdevt[index]; + if ((bp->b_dev = sp->sw_dev) == NODEV) + panic("swstrategy"); + if (sp->sw_vp == NULL) { + bp->b_error = ENODEV; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + VHOLD(sp->sw_vp); + if ((bp->b_flags & B_READ) == 0) { + if (vp = bp->b_vp) { + vp->v_numoutput--; + if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t)&vp->v_numoutput); + } + } + sp->sw_vp->v_numoutput++; + } + if (bp->b_vp != NULL) + brelvp(bp); + bp->b_vp = sp->sw_vp; + VOP_STRATEGY(bp); +} + +/* + * System call swapon(name) enables swapping on device name, + * which must be in the swdevsw. Return EBUSY + * if already swapping on this device. + */ +struct swapon_args { + char *name; +}; +/* ARGSUSED */ +int +swapon(p, uap, retval) + struct proc *p; + struct swapon_args *uap; + int *retval; +{ + register struct vnode *vp; + register struct swdevt *sp; + dev_t dev; + int error; + struct nameidata nd; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VBLK) { + vrele(vp); + return (ENOTBLK); + } + dev = (dev_t)vp->v_rdev; + if (major(dev) >= nblkdev) { + vrele(vp); + return (ENXIO); + } + for (sp = &swdevt[0]; sp->sw_dev != NODEV; sp++) { + if (sp->sw_dev == dev) { + if (sp->sw_flags & SW_FREED) { + vrele(vp); + return (EBUSY); + } + sp->sw_vp = vp; + if (error = swfree(p, sp - swdevt)) { + vrele(vp); + return (error); + } + return (0); + } +#ifdef SEQSWAP + /* + * If we have reached a non-freed sequential device without + * finding what we are looking for, it is an error. + * That is because all interleaved devices must come first + * and sequential devices must be freed in order. + */ + if ((sp->sw_flags & (SW_SEQUENTIAL|SW_FREED)) == SW_SEQUENTIAL) + break; +#endif + } + vrele(vp); + return (EINVAL); +} + +/* + * Swfree(index) frees the index'th portion of the swap map. + * Each of the nswdev devices provides 1/nswdev'th of the swap + * space, which is laid out with blocks of dmmax pages circularly + * among the devices. + */ +int +swfree(p, index) + struct proc *p; + int index; +{ + register struct swdevt *sp; + register swblk_t vsbase; + register long blk; + struct vnode *vp; + register swblk_t dvbase; + register int nblks; + int error; + + sp = &swdevt[index]; + vp = sp->sw_vp; + if (error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)) + return (error); + sp->sw_flags |= SW_FREED; + nblks = sp->sw_nblks; + /* + * Some devices may not exist til after boot time. + * If so, their nblk count will be 0. 
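+ * In that case the size is obtained from the driver's d_psize
+ * entry point below; interleaved devices are then clipped to their
+ * share of the interleaved swap area.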
+ */ + if (nblks <= 0) { + int perdev; + dev_t dev = sp->sw_dev; + + if (bdevsw[major(dev)].d_psize == 0 || + (nblks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) { + (void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); + sp->sw_flags &= ~SW_FREED; + return (ENXIO); + } +#ifdef SEQSWAP + if (index < niswdev) { + perdev = niswap / niswdev; + if (nblks > perdev) + nblks = perdev; + } else { + if (nblks % dmmax) + nblks -= (nblks % dmmax); + nswap += nblks; + } +#else + perdev = nswap / nswdev; + if (nblks > perdev) + nblks = perdev; +#endif + sp->sw_nblks = nblks; + } + if (nblks == 0) { + (void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); + sp->sw_flags &= ~SW_FREED; + return (0); /* XXX error? */ + } +#ifdef SEQSWAP + if (sp->sw_flags & SW_SEQUENTIAL) { + register struct swdevt *swp; + + blk = niswap; + for (swp = &swdevt[niswdev]; swp != sp; swp++) + blk += swp->sw_nblks; +#if 0 + rmfree(swapmap, nblks, blk); + return (0); +#endif + rlist_free(&swapmap, blk, blk + nblks - 1); + vm_swap_size += nblks; + return (0); + } +#endif + for (dvbase = 0; dvbase < nblks; dvbase += dmmax) { + blk = nblks - dvbase; + +#ifdef SEQSWAP + if ((vsbase = index*dmmax + dvbase*niswdev) >= niswap) + panic("swfree"); +#else + if ((vsbase = index*dmmax + dvbase*nswdev) >= nswap) + panic("swfree"); +#endif + if (blk > dmmax) + blk = dmmax; +#if 0 + if (vsbase == 0) { + /* + * First of all chunks... initialize the swapmap. + * Don't use the first cluster of the device + * in case it starts with a label or boot block. + */ + rminit(swapmap, blk - ctod(CLSIZE), + vsbase + ctod(CLSIZE), "swap", nswapmap); + } else if (dvbase == 0) { + /* + * Don't use the first cluster of the device + * in case it starts with a label or boot block. + */ + rmfree(swapmap, blk - ctod(CLSIZE), + vsbase + ctod(CLSIZE)); + } else + rmfree(swapmap, blk, vsbase); +#endif + /* XXX -- we need to exclude the first cluster as above */ + /* but for now, this will work fine... */ + rlist_free(&swapmap, vsbase, vsbase + blk - 1); + vm_swap_size += blk; + } + return (0); +} diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c new file mode 100644 index 0000000..ee6ddf6 --- /dev/null +++ b/sys/vm/vm_unix.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ + * + * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 + */ + +/* + * Traditional sbrk/grow interface to VM + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> + +#include <vm/vm.h> + +extern int swap_pager_full; + +struct obreak_args { + char *nsiz; +}; + +/* ARGSUSED */ +int +obreak(p, uap, retval) + struct proc *p; + struct obreak_args *uap; + int *retval; +{ + register struct vmspace *vm = p->p_vmspace; + vm_offset_t new, old; + int rv; + register int diff; + + old = (vm_offset_t)vm->vm_daddr; + new = round_page(uap->nsiz); + if ((int)(new - old) > p->p_rlimit[RLIMIT_DATA].rlim_cur) + return(ENOMEM); + old = round_page(old + ctob(vm->vm_dsize)); + diff = new - old; + if (diff > 0) { + if (swap_pager_full) { + return(ENOMEM); + } + rv = vm_allocate(&vm->vm_map, &old, diff, FALSE); + if (rv != KERN_SUCCESS) { + return(ENOMEM); + } + vm->vm_dsize += btoc(diff); + } else if (diff < 0) { + diff = -diff; + rv = vm_deallocate(&vm->vm_map, new, diff); + if (rv != KERN_SUCCESS) { + return(ENOMEM); + } + vm->vm_dsize -= btoc(diff); + } + return(0); +} + +struct ovadvise_args { + int anom; +}; + +/* ARGSUSED */ +int +ovadvise(p, uap, retval) + struct proc *p; + struct ovadvise_args *uap; + int *retval; +{ + + return (EINVAL); +} diff --git a/sys/vm/vm_user.c b/sys/vm/vm_user.c new file mode 100644 index 0000000..0f2c234 --- /dev/null +++ b/sys/vm/vm_user.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_user.c 8.2 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * User-exported virtual memory functions. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> + +simple_lock_data_t vm_alloc_lock; /* XXX */ + +#ifdef MACHVMCOMPAT +/* + * BSD style syscall interfaces to MACH calls + * All return MACH return values. 
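+ * Each wrapper ignores the map handed in from user space, substitutes
+ * the calling process's own map, and then calls the corresponding
+ * vm_* routine below.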
+ */ +struct svm_allocate_args { + vm_map_t map; + vm_offset_t *addr; + vm_size_t size; + boolean_t anywhere; +}; +/* ARGSUSED */ +int +svm_allocate(p, uap, retval) + struct proc *p; + struct svm_allocate_args *uap; + int *retval; +{ + vm_offset_t addr; + int rv; + + uap->map = p->p_map; /* XXX */ + + if (copyin((caddr_t)uap->addr, (caddr_t)&addr, sizeof (addr))) + rv = KERN_INVALID_ARGUMENT; + else + rv = vm_allocate(uap->map, &addr, uap->size, uap->anywhere); + if (rv == KERN_SUCCESS) { + if (copyout((caddr_t)&addr, (caddr_t)uap->addr, sizeof(addr))) + rv = KERN_INVALID_ARGUMENT; + } + return((int)rv); +} + +struct svm_deallocate_args { + vm_map_t map; + vm_offset_t addr; + vm_size_t size; +}; +/* ARGSUSED */ +int +svm_deallocate(p, uap, retval) + struct proc *p; + struct svm_deallocate_args *uap; + int *retval; +{ + int rv; + + uap->map = p->p_map; /* XXX */ + rv = vm_deallocate(uap->map, uap->addr, uap->size); + return((int)rv); +} + +struct svm_inherit_args { + vm_map_t map; + vm_offset_t addr; + vm_size_t size; + vm_inherit_t inherit; +}; +/* ARGSUSED */ +int +svm_inherit(p, uap, retval) + struct proc *p; + struct svm_inherit_args *uap; + int *retval; +{ + int rv; + + uap->map = p->p_map; /* XXX */ + rv = vm_inherit(uap->map, uap->addr, uap->size, uap->inherit); + return((int)rv); +} + +struct svm_protect_args { + vm_map_t map; + vm_offset_t addr; + vm_size_t size; + boolean_t setmax; + vm_prot_t prot; +}; +/* ARGSUSED */ +int +svm_protect(p, uap, retval) + struct proc *p; + struct svm_protect_args *uap; + int *retval; +{ + int rv; + + uap->map = p->p_map; /* XXX */ + rv = vm_protect(uap->map, uap->addr, uap->size, uap->setmax, uap->prot); + return((int)rv); +} + +#endif +/* + * vm_inherit sets the inheritence of the specified range in the + * specified map. + */ +int +vm_inherit(map, start, size, new_inheritance) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + vm_inherit_t new_inheritance; +{ + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_inherit(map, trunc_page(start), round_page(start+size), new_inheritance)); +} + +/* + * vm_protect sets the protection of the specified range in the + * specified map. + */ + +int +vm_protect(map, start, size, set_maximum, new_protection) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + boolean_t set_maximum; + vm_prot_t new_protection; +{ + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_protect(map, trunc_page(start), round_page(start+size), new_protection, set_maximum)); +} + +/* + * vm_allocate allocates "zero fill" memory in the specfied + * map. + */ +int +vm_allocate(map, addr, size, anywhere) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + boolean_t anywhere; +{ + int result; + + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + if (size == 0) { + *addr = 0; + return(KERN_SUCCESS); + } + + if (anywhere) + *addr = vm_map_min(map); + else + *addr = trunc_page(*addr); + size = round_page(size); + + result = vm_map_find(map, NULL, (vm_offset_t) 0, addr, size, anywhere); + + return(result); +} + +/* + * vm_deallocate deallocates the specified range of addresses in the + * specified address map. 
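+ * The bounds are truncated/rounded to page boundaries before the map
+ * entries are removed; a zero-length request succeeds without doing
+ * any work.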
+ */ +int +vm_deallocate(map, start, size) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; +{ + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + if (size == (vm_offset_t) 0) + return(KERN_SUCCESS); + + return(vm_map_remove(map, trunc_page(start), round_page(start+size))); +} + +#if 1 +/* + * Similar to vm_allocate but assigns an explicit pager. + */ +int +vm_allocate_with_pager(map, addr, size, anywhere, pager, poffset, internal) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + boolean_t anywhere; + vm_pager_t pager; + vm_offset_t poffset; + boolean_t internal; +{ + register vm_object_t object; + register int result; + + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + *addr = trunc_page(*addr); + size = round_page(size); + + /* + * Lookup the pager/paging-space in the object cache. + * If it's not there, then create a new object and cache + * it. + */ + object = vm_object_lookup(pager); + cnt.v_lookups++; + if (object == NULL) { + object = vm_object_allocate(size); + /* + * From Mike Hibler: "unnamed anonymous objects should never + * be on the hash list ... For now you can just change + * vm_allocate_with_pager to not do vm_object_enter if this + * is an internal object ..." + */ + if (!internal) + vm_object_enter(object, pager); + } else + cnt.v_hits++; + if (internal) + object->flags |= OBJ_INTERNAL; + else { + object->flags &= ~OBJ_INTERNAL; + cnt.v_nzfod -= atop(size); + } + + result = vm_map_find(map, object, poffset, addr, size, anywhere); + if (result != KERN_SUCCESS) + vm_object_deallocate(object); + else if (pager != NULL) + vm_object_setpager(object, pager, (vm_offset_t) 0, TRUE); + return(result); +} +#endif diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c new file mode 100644 index 0000000..b8e5a19 --- /dev/null +++ b/sys/vm/vnode_pager.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 1993,1994 John S. Dyson + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 + * $Id: vnode_pager.c,v 1.17 1994/04/05 03:23:53 davidg Exp $ + */ + +/* + * Page to/from files (vnodes). + * + * TODO: + * pageouts + * fix credential use (uses current process credentials now) + */ + +/* + * MODIFICATIONS: + * John S. Dyson 08 Dec 93 + * + * This file in conjunction with some vm_fault mods, eliminate the performance + * advantage for using the buffer cache and minimize memory copies. + * + * 1) Supports multiple - block reads + * 2) Bypasses buffer cache for reads + * + * TODO: + * + * 1) Totally bypass buffer cache for reads + * (Currently will still sometimes use buffer cache for reads) + * 2) Bypass buffer cache for writes + * (Code does not support it, but mods are simple) + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/mount.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vnode_pager.h> + +#include <sys/buf.h> +#include <miscfs/specfs/specdev.h> + +int vnode_pager_putmulti(); + +void vnode_pager_init(); +vm_pager_t vnode_pager_alloc(caddr_t, vm_offset_t, vm_prot_t, vm_offset_t); +void vnode_pager_dealloc(); +int vnode_pager_getpage(); +int vnode_pager_getmulti(); +int vnode_pager_putpage(); +boolean_t vnode_pager_haspage(); + +struct pagerops vnodepagerops = { + vnode_pager_init, + vnode_pager_alloc, + vnode_pager_dealloc, + vnode_pager_getpage, + vnode_pager_getmulti, + vnode_pager_putpage, + vnode_pager_putmulti, + vnode_pager_haspage +}; + +static int vnode_pager_input(vn_pager_t vnp, vm_page_t *m, int count, int reqpage); +static int vnode_pager_output(vn_pager_t vnp, vm_page_t *m, int count, int *rtvals); +struct buf * getpbuf() ; +void relpbuf(struct buf *bp) ; + +extern vm_map_t pager_map; + +struct pagerlst vnode_pager_list; /* list of managed vnodes */ + +#define MAXBP (PAGE_SIZE/DEV_BSIZE); + +void +vnode_pager_init() +{ + TAILQ_INIT(&vnode_pager_list); +} + +/* + * Allocate (or lookup) pager for a vnode. + * Handle is a vnode pointer. + */ +vm_pager_t +vnode_pager_alloc(handle, size, prot, offset) + caddr_t handle; + vm_size_t size; + vm_prot_t prot; + vm_offset_t offset; +{ + register vm_pager_t pager; + register vn_pager_t vnp; + vm_object_t object; + struct vattr vattr; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + + /* + * Pageout to vnode, no can do yet. + */ + if (handle == NULL) + return(NULL); + + /* + * Vnodes keep a pointer to any associated pager so no need to + * lookup with vm_pager_lookup. 
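+	 * The back pointer is kept in the vnode's v_vmdata field, which is
+	 * filled in when the pager is first allocated below.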
+ */ + vp = (struct vnode *)handle; + pager = (vm_pager_t)vp->v_vmdata; + if (pager == NULL) { + /* + * Allocate pager structures + */ + pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, M_WAITOK); + if (pager == NULL) + return(NULL); + vnp = (vn_pager_t)malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK); + if (vnp == NULL) { + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + /* + * And an object of the appropriate size + */ + if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0) { + object = vm_object_allocate(round_page(vattr.va_size)); + vm_object_enter(object, pager); + vm_object_setpager(object, pager, 0, TRUE); + } else { + free((caddr_t)vnp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + /* + * Hold a reference to the vnode and initialize pager data. + */ + VREF(vp); + vnp->vnp_flags = 0; + vnp->vnp_vp = vp; + vnp->vnp_size = vattr.va_size; + + TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list); + pager->pg_handle = handle; + pager->pg_type = PG_VNODE; + pager->pg_ops = &vnodepagerops; + pager->pg_data = (caddr_t)vnp; + vp->v_vmdata = (caddr_t)pager; + } else { + /* + * vm_object_lookup() will remove the object from the + * cache if found and also gain a reference to the object. + */ + object = vm_object_lookup(pager); + } + return(pager); +} + +void +vnode_pager_dealloc(pager) + vm_pager_t pager; +{ + register vn_pager_t vnp = (vn_pager_t)pager->pg_data; + register struct vnode *vp; + struct proc *p = curproc; /* XXX */ + + if (vp = vnp->vnp_vp) { + vp->v_vmdata = NULL; + vp->v_flag &= ~VTEXT; +#if 0 + /* can hang if done at reboot on NFS FS */ + (void) VOP_FSYNC(vp, p->p_ucred, p); +#endif + vrele(vp); + } + + TAILQ_REMOVE(&vnode_pager_list, pager, pg_list); + free((caddr_t)vnp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); +} + +int +vnode_pager_getmulti(pager, m, count, reqpage, sync) + vm_pager_t pager; + vm_page_t *m; + int count; + int reqpage; + boolean_t sync; +{ + + return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage); +} + +int +vnode_pager_getpage(pager, m, sync) + vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ + + int err; + vm_page_t marray[1]; + if (pager == NULL) + return FALSE; + marray[0] = m; + + return vnode_pager_input((vn_pager_t)pager->pg_data, marray, 1, 0); +} + +boolean_t +vnode_pager_putpage(pager, m, sync) + vm_pager_t pager; + vm_page_t m; + boolean_t sync; +{ + int err; + vm_page_t marray[1]; + int rtvals[1]; + + if (pager == NULL) + return FALSE; + marray[0] = m; + vnode_pager_output((vn_pager_t)pager->pg_data, marray, 1, rtvals); + return rtvals[0]; +} + +int +vnode_pager_putmulti(pager, m, c, sync, rtvals) + vm_pager_t pager; + vm_page_t *m; + int c; + boolean_t sync; + int *rtvals; +{ + return vnode_pager_output((vn_pager_t)pager->pg_data, m, c, rtvals); +} + + +boolean_t +vnode_pager_haspage(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ + register vn_pager_t vnp = (vn_pager_t)pager->pg_data; + daddr_t bn; + int run; + int err; + + /* + * Offset beyond end of file, do not have the page + */ + if (offset >= vnp->vnp_size) { + return(FALSE); + } + + /* + * Read the index to find the disk block to read + * from. If there is no block, report that we don't + * have this data. + * + * Assumes that the vnode has whole page or nothing. + */ + err = VOP_BMAP(vnp->vnp_vp, + offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize, + (struct vnode **)0, &bn, 0); + if (err) { + return(TRUE); + } + return((long)bn < 0 ? FALSE : TRUE); +} + +/* + * Lets the VM system know about a change in size for a file. 
+ * If this vnode is mapped into some address space (i.e. we have a pager + * for it) we adjust our own internal size and flush any cached pages in + * the associated object that are affected by the size change. + * + * Note: this routine may be invoked as a result of a pager put + * operation (possibly at object termination time), so we must be careful. + */ +void +vnode_pager_setsize(vp, nsize) + struct vnode *vp; + u_long nsize; +{ + register vn_pager_t vnp; + register vm_object_t object; + vm_pager_t pager; + + /* + * Not a mapped vnode + */ + if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL) + return; + /* + * Hasn't changed size + */ + pager = (vm_pager_t)vp->v_vmdata; + vnp = (vn_pager_t)pager->pg_data; + if (nsize == vnp->vnp_size) + return; + /* + * No object. + * This can happen during object termination since + * vm_object_page_clean is called after the object + * has been removed from the hash table, and clean + * may cause vnode write operations which can wind + * up back here. + */ + object = vm_object_lookup(pager); + if (object == NULL) + return; + + /* + * File has shrunk. + * Toss any cached pages beyond the new EOF. + */ + if (round_page(nsize) < round_page(vnp->vnp_size)) { + vm_object_lock(object); + vm_object_page_remove(object, + (vm_offset_t)round_page(nsize), round_page(vnp->vnp_size)); + vm_object_unlock(object); + } + vnp->vnp_size = (vm_offset_t)nsize; + vm_object_deallocate(object); +} + +void +vnode_pager_umount(mp) + register struct mount *mp; +{ + register vm_pager_t pager, npager; + struct vnode *vp; + + pager = vnode_pager_list.tqh_first; + while( pager) { + /* + * Save the next pointer now since uncaching may + * terminate the object and render pager invalid + */ + vp = ((vn_pager_t)pager->pg_data)->vnp_vp; + npager = pager->pg_list.tqe_next; + if (mp == (struct mount *)0 || vp->v_mount == mp) + (void) vnode_pager_uncache(vp); + pager = npager; + } +} + +/* + * Remove vnode associated object from the object cache. + * + * Note: this routine may be invoked as a result of a pager put + * operation (possibly at object termination time), so we must be careful. + */ +boolean_t +vnode_pager_uncache(vp) + register struct vnode *vp; +{ + register vm_object_t object; + boolean_t uncached, locked; + vm_pager_t pager; + + /* + * Not a mapped vnode + */ + pager = (vm_pager_t)vp->v_vmdata; + if (pager == NULL) + return (TRUE); + /* + * Unlock the vnode if it is currently locked. + * We do this since uncaching the object may result + * in its destruction which may initiate paging + * activity which may necessitate locking the vnode. + */ + locked = VOP_ISLOCKED(vp); + if (locked) + VOP_UNLOCK(vp); + /* + * Must use vm_object_lookup() as it actually removes + * the object from the cache list. + */ + object = vm_object_lookup(pager); + if (object) { + uncached = (object->ref_count <= 1); + pager_cache(object, FALSE); + } else + uncached = TRUE; + if (locked) + VOP_LOCK(vp); + return(uncached); +} +#if 0 +/* + * Remove vnode associated object from the object cache. + * + * XXX unlock the vnode if it is currently locked. + * We must do this since uncaching the object may result in its + * destruction which may initiate paging activity which may necessitate + * re-locking the vnode. 
+ */ +boolean_t +vnode_pager_uncache(vp) + register struct vnode *vp; +{ + register vm_object_t object; + boolean_t uncached; + vm_pager_t pager; + + /* + * Not a mapped vnode + */ + pager = (vm_pager_t)vp->v_vmdata; + if (pager == NULL) + return (TRUE); + /* + * Must use vm_object_lookup() as it actually removes + * the object from the cache list. + */ + object = vm_object_lookup(pager); + if (object) { + uncached = (object->ref_count <= 1); + VOP_UNLOCK(vp); + pager_cache(object, FALSE); + VOP_LOCK(vp); + } else + uncached = TRUE; + return(uncached); +} +#endif + + +void +vnode_pager_freepage(m) + vm_page_t m; +{ + PAGE_WAKEUP(m); + vm_page_free(m); +} + +/* + * calculate the linear (byte) disk address of specified virtual + * file address + */ +vm_offset_t +vnode_pager_addr(vp, address) + struct vnode *vp; + vm_offset_t address; +{ + int rtaddress; + int bsize; + vm_offset_t block; + struct vnode *rtvp; + int err; + int vblock, voffset; + int run; + + bsize = vp->v_mount->mnt_stat.f_iosize; + vblock = address / bsize; + voffset = address % bsize; + + err = VOP_BMAP(vp,vblock,&rtvp,&block,0); + + if( err) + rtaddress = -1; + else + rtaddress = block * DEV_BSIZE + voffset; + + return rtaddress; +} + +/* + * interrupt routine for I/O completion + */ +void +vnode_pager_iodone(bp) + struct buf *bp; +{ + bp->b_flags |= B_DONE; + wakeup((caddr_t)bp); +} + +/* + * small block file system vnode pager input + */ +int +vnode_pager_input_smlfs(vnp, m) + vn_pager_t vnp; + vm_page_t m; +{ + int i; + int s; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + struct buf *bp; + vm_offset_t mapsize; + vm_offset_t foff; + vm_offset_t kva; + int fileaddr; + int block; + vm_offset_t bsize; + int error = 0; + int run; + + paging_offset = m->object->paging_offset; + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + foff = m->offset + paging_offset; + + VOP_BMAP(vp, foff, &dp, 0, 0); + + kva = vm_pager_map_page(m); + + for(i=0;i<PAGE_SIZE/bsize;i++) { + /* + * calculate logical block and offset + */ + block = foff / bsize + i; + s = splbio(); + while (bp = incore(vp, block)) { + int amount; + + /* + * wait until the buffer is avail or gone + */ + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep ((caddr_t)bp, PVM, "vnwblk", 0); + continue; + } + + amount = bsize; + if ((foff + bsize) > vnp->vnp_size) + amount = vnp->vnp_size - foff; + + /* + * make sure that this page is in the buffer + */ + if ((amount > 0) && amount <= bp->b_bcount) { + bp->b_flags |= B_BUSY; + splx(s); + + /* + * copy the data from the buffer + */ + bcopy(bp->b_un.b_addr, (caddr_t)kva + i * bsize, amount); + if (amount < bsize) { + bzero((caddr_t)kva + amount, bsize - amount); + } + bp->b_flags &= ~B_BUSY; + wakeup((caddr_t)bp); + goto nextblock; + } + break; + } + splx(s); + fileaddr = vnode_pager_addr(vp, foff + i * bsize); + if( fileaddr != -1) { + bp = getpbuf(); + VHOLD(vp); + + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = vnode_pager_iodone; + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva + i * bsize; + bp->b_blkno = fileaddr / DEV_BSIZE; + bgetvp(dp, bp); + bp->b_bcount = bsize; + bp->b_bufsize = bsize; + + /* do the input */ + VOP_STRATEGY(bp); + + /* we definitely need to be at splbio here */ + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnsrd", 0); + 
} + splx(s); + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + if( error) + break; + } else { + bzero((caddr_t) kva + i * bsize, bsize); + } +nextblock: + } + vm_pager_unmap_page(kva); + if( error) { + return VM_PAGER_FAIL; + } + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->flags |= PG_CLEAN; + m->flags &= ~PG_LAUNDRY; + return VM_PAGER_OK; + +} + + +/* + * old style vnode pager output routine + */ +int +vnode_pager_input_old(vnp, m) + vn_pager_t vnp; + vm_page_t m; +{ + int i; + struct uio auio; + struct iovec aiov; + int error; + int size; + vm_offset_t foff; + vm_offset_t kva; + + error = 0; + foff = m->offset + m->object->paging_offset; + /* + * Return failure if beyond current EOF + */ + if (foff >= vnp->vnp_size) { + return VM_PAGER_BAD; + } else { + size = PAGE_SIZE; + if (foff + size > vnp->vnp_size) + size = vnp->vnp_size - foff; +/* + * Allocate a kernel virtual address and initialize so that + * we can use VOP_READ/WRITE routines. + */ + kva = vm_pager_map_page(m); + aiov.iov_base = (caddr_t)kva; + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = foff; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_resid = size; + auio.uio_procp = (struct proc *)0; + + error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred); + if (!error) { + register int count = size - auio.uio_resid; + + if (count == 0) + error = EINVAL; + else if (count != PAGE_SIZE) + bzero((caddr_t)kva + count, PAGE_SIZE - count); + } + vm_pager_unmap_page(kva); + } + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + m->flags |= PG_CLEAN; + m->flags &= ~PG_LAUNDRY; + return error?VM_PAGER_FAIL:VM_PAGER_OK; +} + +/* + * generic vnode pager input routine + */ +int +vnode_pager_input(vnp, m, count, reqpage) + register vn_pager_t vnp; + vm_page_t *m; + int count, reqpage; +{ + int i,j; + vm_offset_t kva, foff; + int size; + struct proc *p = curproc; /* XXX */ + vm_object_t object; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + vm_offset_t mapsize; + int bsize; + + int first, last; + int reqaddr, firstaddr; + int run; + int block, offset; + + int nbp; + struct buf *bp; + int s; + int failflag; + + int errtype=0; /* 0 is file type otherwise vm type */ + int error = 0; + + object = m[reqpage]->object; /* all vm_page_t items are in same object */ + paging_offset = object->paging_offset; + + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + + /* get the UNDERLYING device for the file with VOP_BMAP() */ + /* + * originally, we did not check for an error return + * value -- assuming an fs always has a bmap entry point + * -- that assumption is wrong!!! 
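+	 * If VOP_BMAP fails here, kva is left at zero and we fall back to
+	 * the old VOP_READ-based path (vnode_pager_input_old) below.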
+ */ + kva = 0; + mapsize = 0; + foff = m[reqpage]->offset + paging_offset; + if (!VOP_BMAP(vp, foff, &dp, 0, 0)) { + /* + * we do not block for a kva, notice we default to a kva + * conservative behavior + */ + kva = kmem_alloc_pageable(pager_map, (mapsize = count*PAGE_SIZE)); + if( !kva) { + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + m[0] = m[reqpage]; + kva = kmem_alloc_wait(pager_map, mapsize = PAGE_SIZE); + reqpage = 0; + count = 1; + } + } + + /* + * if we can't get a kva or we can't bmap, use old VOP code + */ + if (!kva) { + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + return vnode_pager_input_old(vnp, m[reqpage]); + /* + * if the blocksize is smaller than a page size, then use + * special small filesystem code. NFS sometimes has a small + * blocksize, but it can handle large reads itself. + */ + } else if( (PAGE_SIZE / bsize) > 1 && + (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { + + kmem_free_wakeup(pager_map, kva, mapsize); + + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + return vnode_pager_input_smlfs(vnp, m[reqpage]); + } + +/* + * here on direct device I/O + */ + + + /* + * This pathetic hack gets data from the buffer cache, if it's there. + * I believe that this is not really necessary, and the ends can + * be gotten by defaulting to the normal vfs read behavior, but this + * might be more efficient, because the will NOT invoke read-aheads + * and one of the purposes of this code is to bypass the buffer + * cache and keep from flushing it by reading in a program. + */ + /* + * calculate logical block and offset + */ + block = foff / bsize; + offset = foff % bsize; + s = splbio(); + + /* + * if we have a buffer in core, then try to use it + */ + while (bp = incore(vp, block)) { + int amount; + + /* + * wait until the buffer is avail or gone + */ + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep ((caddr_t)bp, PVM, "vnwblk", 0); + continue; + } + + amount = PAGE_SIZE; + if ((foff + amount) > vnp->vnp_size) + amount = vnp->vnp_size - foff; + + /* + * make sure that this page is in the buffer + */ + if ((amount > 0) && (offset + amount) <= bp->b_bcount) { + bp->b_flags |= B_BUSY; + splx(s); + + /* + * map the requested page + */ + pmap_kenter(kva, VM_PAGE_TO_PHYS(m[reqpage])); + pmap_update(); + + /* + * copy the data from the buffer + */ + bcopy(bp->b_un.b_addr + offset, (caddr_t)kva, amount); + if (amount < PAGE_SIZE) { + bzero((caddr_t)kva + amount, PAGE_SIZE - amount); + } + /* + * unmap the page and free the kva + */ + pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE); + kmem_free_wakeup(pager_map, kva, mapsize); + /* + * release the buffer back to the block subsystem + */ + bp->b_flags &= ~B_BUSY; + wakeup((caddr_t)bp); + /* + * we did not have to do any work to get the requested + * page, the read behind/ahead does not justify a read + */ + for (i = 0; i < count; i++) { + if (i != reqpage) { + vnode_pager_freepage(m[i]); + } + } + count = 1; + reqpage = 0; + m[0] = m[reqpage]; + + /* + * sorry for the goto + */ + goto finishup; + } + /* + * buffer is nowhere to be found, read from the disk + */ + break; + } + splx(s); + + reqaddr = vnode_pager_addr(vp, foff); + s = splbio(); + /* + * Make sure that our I/O request is contiguous. + * Scan backward and stop for the first discontiguous + * entry or stop for a page being in buffer cache. 
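+	 * Pages trimmed from the cluster are simply freed; the surviving
+	 * contiguous run around the requested page is read in with a
+	 * single strategy call.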
+ */ + failflag = 0; + first = reqpage; + for (i = reqpage - 1; i >= 0; --i) { + if (failflag || + incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || + (vnode_pager_addr(vp, m[i]->offset + paging_offset)) + != reqaddr + (i - reqpage) * PAGE_SIZE) { + vnode_pager_freepage(m[i]); + failflag = 1; + } else { + first = i; + } + } + + /* + * Scan forward and stop for the first non-contiguous + * entry or stop for a page being in buffer cache. + */ + failflag = 0; + last = reqpage + 1; + for (i = reqpage + 1; i < count; i++) { + if (failflag || + incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || + (vnode_pager_addr(vp, m[i]->offset + paging_offset)) + != reqaddr + (i - reqpage) * PAGE_SIZE) { + vnode_pager_freepage(m[i]); + failflag = 1; + } else { + last = i + 1; + } + } + splx(s); + + /* + * the first and last page have been calculated now, move input + * pages to be zero based... + */ + count = last; + if (first != 0) { + for (i = first; i < count; i++) { + m[i - first] = m[i]; + } + count -= first; + reqpage -= first; + } + + /* + * calculate the file virtual address for the transfer + */ + foff = m[0]->offset + paging_offset; + /* + * and get the disk physical address (in bytes) + */ + firstaddr = vnode_pager_addr(vp, foff); + + /* + * calculate the size of the transfer + */ + size = count * PAGE_SIZE; + if ((foff + size) > vnp->vnp_size) + size = vnp->vnp_size - foff; + + /* + * round up physical size for real devices + */ + if( dp->v_type == VBLK || dp->v_type == VCHR) + size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + /* + * and map the pages to be read into the kva + */ + for (i = 0; i < count; i++) + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + + pmap_update(); + bp = getpbuf(); + VHOLD(vp); + + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_READ | B_CALL; + bp->b_iodone = vnode_pager_iodone; + /* B_PHYS is not set, but it is nice to fill this in */ + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = firstaddr / DEV_BSIZE; + bgetvp(dp, bp); + bp->b_bcount = size; + bp->b_bufsize = size; + + /* do the input */ + VOP_STRATEGY(bp); + + s = splbio(); + /* we definitely need to be at splbio here */ + + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnread", 0); + } + splx(s); + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + if (!error) { + if (size != count * PAGE_SIZE) + bzero((caddr_t)kva + size, PAGE_SIZE * count - size); + } + + pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); + kmem_free_wakeup(pager_map, kva, mapsize); + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + +finishup: + for (i = 0; i < count; i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + if (i != reqpage) { + /* + * whether or not to leave the page activated + * is up in the air, but we should put the page + * on a page queue somewhere. (it already is in + * the object). + * Result: It appears that emperical results show + * that deactivating pages is best. 
+ */ + /* + * just in case someone was asking for this + * page we now tell them that it is ok to use + */ + if (!error) { + vm_page_deactivate(m[i]); + PAGE_WAKEUP(m[i]); + m[i]->flags &= ~PG_FAKE; + m[i]->act_count = 2; + } else { + vnode_pager_freepage(m[i]); + } + } + } + if (error) { + printf("vnode pager read error: %d\n", error); + } + if (errtype) + return error; + return (error ? VM_PAGER_FAIL : VM_PAGER_OK); +} + +/* + * old-style vnode pager output routine + */ +int +vnode_pager_output_old(vnp, m) + register vn_pager_t vnp; + vm_page_t m; +{ + vm_offset_t foff; + vm_offset_t kva; + vm_offset_t size; + struct iovec aiov; + struct uio auio; + struct vnode *vp; + int error; + + vp = vnp->vnp_vp; + foff = m->offset + m->object->paging_offset; + /* + * Return failure if beyond current EOF + */ + if (foff >= vnp->vnp_size) { + return VM_PAGER_BAD; + } else { + size = PAGE_SIZE; + if (foff + size > vnp->vnp_size) + size = vnp->vnp_size - foff; +/* + * Allocate a kernel virtual address and initialize so that + * we can use VOP_WRITE routines. + */ + kva = vm_pager_map_page(m); + aiov.iov_base = (caddr_t)kva; + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = foff; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_resid = size; + auio.uio_procp = (struct proc *)0; + + error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred); + + if (!error) { + if ((size - auio.uio_resid) == 0) { + error = EINVAL; + } + } + vm_pager_unmap_page(kva); + return error?VM_PAGER_FAIL:VM_PAGER_OK; + } +} + +/* + * vnode pager output on a small-block file system + */ +int +vnode_pager_output_smlfs(vnp, m) + vn_pager_t vnp; + vm_page_t m; +{ + int i; + int s; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + struct buf *bp; + vm_offset_t mapsize; + vm_offset_t foff; + vm_offset_t kva; + int fileaddr; + int block; + vm_offset_t bsize; + int run; + int error = 0; + + paging_offset = m->object->paging_offset; + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + foff = m->offset + paging_offset; + + VOP_BMAP(vp, foff, &dp, 0, 0); + kva = vm_pager_map_page(m); + for(i = 0; !error && i < (PAGE_SIZE/bsize); i++) { + /* + * calculate logical block and offset + */ + fileaddr = vnode_pager_addr(vp, foff + i * bsize); + if( fileaddr != -1) { + s = splbio(); + if( bp = incore( vp, (foff/bsize) + i)) { + bp = getblk(vp, (foff/bsize) + i, bp->b_bufsize,0, 0); + bp->b_flags |= B_INVAL; + brelse(bp); + } + splx(s); + + bp = getpbuf(); + VHOLD(vp); + + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_CALL | B_WRITE; + bp->b_iodone = vnode_pager_iodone; + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva + i * bsize; + bp->b_blkno = fileaddr / DEV_BSIZE; + bgetvp(dp, bp); + ++dp->v_numoutput; + /* for NFS */ + bp->b_dirtyoff = 0; + bp->b_dirtyend = bsize; + bp->b_bcount = bsize; + bp->b_bufsize = bsize; + + /* do the input */ + VOP_STRATEGY(bp); + + /* we definitely need to be at splbio here */ + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnswrt", 0); + } + splx(s); + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + } + } + vm_pager_unmap_page(kva); + if( error) + return VM_PAGER_FAIL; + else + return VM_PAGER_OK; +} + +/* + * generic vnode pager 
output routine + */ +int +vnode_pager_output(vnp, m, count, rtvals) + vn_pager_t vnp; + vm_page_t *m; + int count; + int *rtvals; +{ + int i,j; + vm_offset_t kva, foff; + int size; + struct proc *p = curproc; /* XXX */ + vm_object_t object; + vm_offset_t paging_offset; + struct vnode *dp, *vp; + struct buf *bp; + vm_offset_t mapsize; + vm_offset_t reqaddr; + int run; + int bsize; + int s; + + int error = 0; + +retryoutput: + object = m[0]->object; /* all vm_page_t items are in same object */ + paging_offset = object->paging_offset; + + vp = vnp->vnp_vp; + bsize = vp->v_mount->mnt_stat.f_iosize; + + for(i=0;i<count;i++) + rtvals[i] = VM_PAGER_AGAIN; + + /* + * if the filesystem does not have a bmap, then use the + * old code + */ + if (VOP_BMAP(vp, m[0]->offset+paging_offset, &dp, 0, 0)) { + + rtvals[0] = vnode_pager_output_old(vnp, m[0]); + + pmap_clear_modify(VM_PAGE_TO_PHYS(m[0])); + m[0]->flags |= PG_CLEAN; + m[0]->flags &= ~PG_LAUNDRY; + return rtvals[0]; + } + + /* + * if the filesystem has a small blocksize, then use + * the small block filesystem output code + */ + if ((bsize < PAGE_SIZE) && + (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { + + for(i=0;i<count;i++) { + rtvals[i] = vnode_pager_output_smlfs(vnp, m[i]); + if( rtvals[i] == VM_PAGER_OK) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + } + } + return rtvals[0]; + } + + /* + * get some kva for the output + */ + kva = kmem_alloc_pageable(pager_map, (mapsize = count*PAGE_SIZE)); + if( !kva) { + kva = kmem_alloc_pageable(pager_map, (mapsize = PAGE_SIZE)); + count = 1; + if( !kva) + return rtvals[0]; + } + + for(i=0;i<count;i++) { + foff = m[i]->offset + paging_offset; + if (foff >= vnp->vnp_size) { + for(j=i;j<count;j++) + rtvals[j] = VM_PAGER_BAD; + count = i; + break; + } + } + if (count == 0) { + return rtvals[0]; + } + foff = m[0]->offset + paging_offset; + reqaddr = vnode_pager_addr(vp, foff); + /* + * Scan forward and stop for the first non-contiguous + * entry or stop for a page being in buffer cache. 
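+	 * (Only the disk addresses are checked here; any buffer cache
+	 * blocks covering the range are invalidated separately below,
+	 * before the write is started.)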
+ */ + for (i = 1; i < count; i++) { + if ( vnode_pager_addr(vp, m[i]->offset + paging_offset) + != reqaddr + i * PAGE_SIZE) { + count = i; + break; + } + } + + /* + * calculate the size of the transfer + */ + size = count * PAGE_SIZE; + if ((foff + size) > vnp->vnp_size) + size = vnp->vnp_size - foff; + + /* + * round up physical size for real devices + */ + if( dp->v_type == VBLK || dp->v_type == VCHR) + size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + + /* + * and map the pages to be read into the kva + */ + for (i = 0; i < count; i++) + pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + pmap_update(); +/* + printf("vnode: writing foff: %d, devoff: %d, size: %d\n", + foff, reqaddr, size); +*/ + /* + * next invalidate the incore vfs_bio data + */ + for (i = 0; i < count; i++) { + int filblock = (foff + i * PAGE_SIZE) / bsize; + struct buf *fbp; + + s = splbio(); + if( fbp = incore( vp, filblock)) { + /* printf("invalidating: %d\n", filblock); */ + fbp = getblk(vp, filblock, fbp->b_bufsize,0,0); + fbp->b_flags |= B_INVAL; + brelse(fbp); + } + splx(s); + } + + + bp = getpbuf(); + VHOLD(vp); + /* build a minimal buffer header */ + bp->b_flags = B_BUSY | B_WRITE | B_CALL; + bp->b_iodone = vnode_pager_iodone; + /* B_PHYS is not set, but it is nice to fill this in */ + bp->b_proc = curproc; + bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; + + if( bp->b_rcred != NOCRED) + crhold(bp->b_rcred); + if( bp->b_wcred != NOCRED) + crhold(bp->b_wcred); + bp->b_un.b_addr = (caddr_t) kva; + bp->b_blkno = reqaddr / DEV_BSIZE; + bgetvp(dp, bp); + ++dp->v_numoutput; + + /* for NFS */ + bp->b_dirtyoff = 0; + bp->b_dirtyend = size; + + bp->b_bcount = size; + bp->b_bufsize = size; + + /* do the output */ + VOP_STRATEGY(bp); + + s = splbio(); + + /* we definitely need to be at splbio here */ + + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PVM, "vnwrite", 0); + } + splx(s); + + if ((bp->b_flags & B_ERROR) != 0) + error = EIO; + + pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); + kmem_free_wakeup(pager_map, kva, mapsize); + + /* + * free the buffer header back to the swap buffer pool + */ + relpbuf(bp); + HOLDRELE(vp); + + if( !error) { + for(i=0;i<count;i++) { + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); + m[i]->flags |= PG_CLEAN; + m[i]->flags &= ~PG_LAUNDRY; + rtvals[i] = VM_PAGER_OK; + } + } else if( count != 1) { + error = 0; + count = 1; + goto retryoutput; + } + + if (error) { + printf("vnode pager write error: %d\n", error); + } + return (error ? VM_PAGER_FAIL : VM_PAGER_OK); +} + diff --git a/sys/vm/vnode_pager.h b/sys/vm/vnode_pager.h new file mode 100644 index 0000000..b01dc54 --- /dev/null +++ b/sys/vm/vnode_pager.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)vnode_pager.h	8.1 (Berkeley) 6/11/93
+ */
+
+#ifndef	_VNODE_PAGER_
+#define	_VNODE_PAGER_	1
+
+/*
+ * VNODE pager private data.
+ */
+struct vnpager {
+	int		vnp_flags;	/* flags */
+	struct vnode	*vnp_vp;	/* vnode */
+	vm_size_t	vnp_size;	/* vnode current size */
+};
+typedef struct vnpager	*vn_pager_t;
+
+#define VN_PAGER_NULL	((vn_pager_t)0)
+
+#endif	/* _VNODE_PAGER_ */